diff --git a/.chloggen/feat_ottl_xml-parse-function.yaml b/.chloggen/feat_ottl_xml-parse-function.yaml new file mode 100755 index 0000000000000..710eedae3f487 --- /dev/null +++ b/.chloggen/feat_ottl_xml-parse-function.yaml @@ -0,0 +1,13 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: "enhancement" + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: pkg/ottl + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add `ParseXML` function for parsing XML from a target string. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [31133] diff --git a/pkg/ottl/e2e/e2e_test.go b/pkg/ottl/e2e/e2e_test.go index f850af9b0aa36..315a00e8c28b8 100644 --- a/pkg/ottl/e2e/e2e_test.go +++ b/pkg/ottl/e2e/e2e_test.go @@ -485,6 +485,22 @@ func Test_e2e_converters(t *testing.T) { m.PutStr("k2", "v2__!__v2") }, }, + { + statement: `set(attributes["test"], ParseXML("This is a log message!"))`, + want: func(tCtx ottllog.TransformContext) { + log := tCtx.GetLogRecord().Attributes().PutEmptyMap("test") + log.PutStr("tag", "Log") + + attrs := log.PutEmptyMap("attributes") + attrs.PutStr("id", "1") + + logChildren := log.PutEmptySlice("children") + + message := logChildren.AppendEmpty().SetEmptyMap() + message.PutStr("tag", "Message") + message.PutStr("content", "This is a log message!") + }, + }, { statement: `set(attributes["test"], Seconds(Duration("1m")))`, want: func(tCtx ottllog.TransformContext) { diff --git a/pkg/ottl/ottlfuncs/README.md b/pkg/ottl/ottlfuncs/README.md index a87b6562f57bf..94712ca3074db 100644 --- a/pkg/ottl/ottlfuncs/README.md +++ b/pkg/ottl/ottlfuncs/README.md @@ -403,6 +403,7 @@ Available Converters: - [ParseCSV](#parsecsv) - 
[ParseJSON](#parsejson) - [ParseKeyValue](#parsekeyvalue) +- [ParseXML](#parsexml) - [Seconds](#seconds) - [SHA1](#sha1) - [SHA256](#sha256) @@ -913,6 +914,78 @@ Examples: - `ParseKeyValue(attributes["pairs"])` +### ParseXML + +`ParseXML(target)` + +The `ParseXML` Converter returns a `pcommon.Map` struct that is the result of parsing the target string as an XML document. + +`target` is a Getter that returns a string. This string should be in XML format. +If `target` is not a string, is nil, or cannot be parsed as XML, `ParseXML` will return an error. + +Unmarshalling XML is done using the following rules: +1. All character data for an XML element is trimmed, joined, and placed into the `content` field. +2. The tag for an XML element is trimmed, and placed into the `tag` field. +3. The attributes for an XML element are placed as a `pcommon.Map` into the `attributes` field. +4. Processing instructions, directives, and comments are ignored and not represented in the resultant map. +5. All child elements are parsed as above, and placed in a `pcommon.Slice`, which is then placed into the `children` field. 
+ +For example, the following XML document: +```xml +<?xml version="1.0" encoding="UTF-8"?> +<Log> + <User> + <ID>00001</ID> + <Name type="first">Joe</Name> + <Email>joe.smith@example.com</Email> + </User> + <Text>User fired alert A</Text> +</Log> +``` + +will be parsed as: +```json +{ + "tag": "Log", + "children": [ + { + "tag": "User", + "children": [ + { + "tag": "ID", + "content": "00001" + }, + { + "tag": "Name", + "content": "Joe", + "attributes": { + "type": "first" + } + }, + { + "tag": "Email", + "content": "joe.smith@example.com" + } + ] + }, + { + "tag": "Text", + "content": "User fired alert A" + } + ] +} +``` + +Examples: + +- `ParseXML(body)` + +- `ParseXML(attributes["xml"])` + +- `ParseXML("<HostInfo hostname=\"example.com\" zone=\"east-1\" cloudprovider=\"aws\" />")` + + + ### Seconds `Seconds(value)` diff --git a/pkg/ottl/ottlfuncs/func_parse_xml.go b/pkg/ottl/ottlfuncs/func_parse_xml.go new file mode 100644 index 0000000000000..42dac93307dfb --- /dev/null +++ b/pkg/ottl/ottlfuncs/func_parse_xml.go @@ -0,0 +1,134 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package ottlfuncs // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl/ottlfuncs" + +import ( + "bytes" + "context" + "encoding/xml" + "errors" + "fmt" + "strings" + + "go.opentelemetry.io/collector/pdata/pcommon" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl" +) + +type ParseXMLArguments[K any] struct { + Target ottl.StringGetter[K] +} + +func NewParseXMLFactory[K any]() ottl.Factory[K] { + return ottl.NewFactory("ParseXML", &ParseXMLArguments[K]{}, createParseXMLFunction[K]) +} + +func createParseXMLFunction[K any](_ ottl.FunctionContext, oArgs ottl.Arguments) (ottl.ExprFunc[K], error) { + args, ok := oArgs.(*ParseXMLArguments[K]) + + if !ok { + return nil, fmt.Errorf("ParseXMLFactory args must be of type *ParseXMLArguments[K]") + } + + return parseXML(args.Target), nil +} + +// parseXML returns a `pcommon.Map` struct that is a result of parsing the target string as XML +func parseXML[K any](target ottl.StringGetter[K]) ottl.ExprFunc[K] { + return func(ctx context.Context, tCtx K) (any, error) { + 
targetVal, err := target.Get(ctx, tCtx) + if err != nil { + return nil, err + } + + parsedXML := xmlElement{} + + decoder := xml.NewDecoder(strings.NewReader(targetVal)) + err = decoder.Decode(&parsedXML) + if err != nil { + return nil, fmt.Errorf("unmarshal xml: %w", err) + } + + if decoder.InputOffset() != int64(len(targetVal)) { + return nil, errors.New("trailing bytes after parsing xml") + } + + parsedMap := pcommon.NewMap() + parsedXML.intoMap(parsedMap) + + return parsedMap, nil + } +} + +type xmlElement struct { + tag string + attributes []xml.Attr + text string + children []xmlElement +} + +// UnmarshalXML implements xml.Unmarshaler for xmlElement +func (a *xmlElement) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + a.tag = start.Name.Local + a.attributes = start.Attr + + for { + tok, err := d.Token() + if err != nil { + return fmt.Errorf("decode next token: %w", err) + } + + switch t := tok.(type) { + case xml.StartElement: + child := xmlElement{} + err := d.DecodeElement(&child, &t) + if err != nil { + return err + } + + a.children = append(a.children, child) + case xml.EndElement: + // End element means we've reached the end of parsing + return nil + case xml.CharData: + // Strip leading/trailing spaces to ignore newlines and + // indentation in formatted XML + a.text += string(bytes.TrimSpace([]byte(t))) + case xml.Comment: // ignore comments + case xml.ProcInst: // ignore processing instructions + case xml.Directive: // ignore directives + default: + return fmt.Errorf("unexpected token type %T", t) + } + } +} + +// intoMap converts and adds the xmlElement into the provided pcommon.Map. 
+func (a xmlElement) intoMap(m pcommon.Map) { + m.EnsureCapacity(4) + + m.PutStr("tag", a.tag) + + if a.text != "" { + m.PutStr("content", a.text) + } + + if len(a.attributes) > 0 { + attrs := m.PutEmptyMap("attributes") + attrs.EnsureCapacity(len(a.attributes)) + + for _, attr := range a.attributes { + attrs.PutStr(attr.Name.Local, attr.Value) + } + } + + if len(a.children) > 0 { + children := m.PutEmptySlice("children") + children.EnsureCapacity(len(a.children)) + + for _, child := range a.children { + child.intoMap(children.AppendEmpty().SetEmptyMap()) + } + } +} diff --git a/pkg/ottl/ottlfuncs/func_parse_xml_test.go b/pkg/ottl/ottlfuncs/func_parse_xml_test.go new file mode 100644 index 0000000000000..8c348d3a6e762 --- /dev/null +++ b/pkg/ottl/ottlfuncs/func_parse_xml_test.go @@ -0,0 +1,309 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package ottlfuncs + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/pdata/pcommon" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl" +) + +func Test_ParseXML(t *testing.T) { + tests := []struct { + name string + oArgs ottl.Arguments + want map[string]any + createError string + parseError string + }{ + { + name: "Text values in nested elements", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return "00001Joejoe.smith@example.comUser did a thing", nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "children": []any{ + map[string]any{ + "tag": "User", + "children": []any{ + map[string]any{ + "tag": "ID", + "content": "00001", + }, + map[string]any{ + "tag": "Name", + "content": "Joe", + }, + map[string]any{ + "tag": "Email", + "content": "joe.smith@example.com", + }, + }, + }, + map[string]any{ + "tag": "Text", + "content": "User did a thing", + }, + }, + }, + }, + { + 
name: "Formatted example", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return ` + + + 00001 + Joe + joe.smith@example.com + + User did a thing + `, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "children": []any{ + map[string]any{ + "tag": "User", + "children": []any{ + map[string]any{ + "tag": "ID", + "content": "00001", + }, + map[string]any{ + "tag": "Name", + "content": "Joe", + }, + map[string]any{ + "tag": "Email", + "content": "joe.smith@example.com", + }, + }, + }, + map[string]any{ + "tag": "Text", + "content": "User did a thing", + }, + }, + }, + }, + { + name: "Multiple tags with the same name", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return `This record has a collision`, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "content": "This record has a collision", + "children": []any{ + map[string]any{ + "tag": "User", + "attributes": map[string]any{ + "id": "0001", + }, + }, + map[string]any{ + "tag": "User", + "attributes": map[string]any{ + "id": "0002", + }, + }, + }, + }, + }, + { + name: "Multiple lines of content", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return ` + This record has multiple lines of + + text content + `, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "content": "This record has multiple lines oftext content", + "children": []any{ + map[string]any{ + "tag": "User", + "attributes": map[string]any{ + "id": "0001", + }, + }, + }, + }, + }, + { + name: "Attribute only element", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return ``, nil + }, + }, + }, + want: map[string]any{ + "tag": "HostInfo", + "attributes": map[string]any{ + 
"hostname": "example.com", + "zone": "east-1", + "cloudprovider": "aws", + }, + }, + }, + { + name: "Ignores XML declaration", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return `Log content`, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "content": "Log content", + }, + }, + { + name: "Ignores comments", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return `This has a comment `, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "content": "This has a comment", + }, + }, + { + name: "Ignores processing instructions", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return `Log content`, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "content": "Log content", + }, + }, + { + name: "Ignores directives", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return `Log content`, nil + }, + }, + }, + want: map[string]any{ + "tag": "Log", + "content": "Log content", + }, + }, + { + name: "Missing closing element", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return ``, nil + }, + }, + }, + parseError: "unmarshal xml: decode next token: XML syntax error on line 1: unexpected EOF", + }, + { + name: "Missing nested closing element", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return ``, nil + }, + }, + }, + parseError: "unmarshal xml: decode next token: XML syntax error on line 1: element closed by ", + }, + { + name: "Multiple XML elements in payload (trailing bytes)", + oArgs: &ParseXMLArguments[any]{ + Target: 
ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return ``, nil + }, + }, + }, + parseError: "trailing bytes after parsing xml", + }, + { + name: "Error getting target", + oArgs: &ParseXMLArguments[any]{ + Target: ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return "", fmt.Errorf("failed to get string") + }, + }, + }, + parseError: "error getting value in ottl.StandardStringGetter[interface {}]: failed to get string", + }, + { + name: "Invalid arguments", + oArgs: nil, + createError: "ParseXMLFactory args must be of type *ParseXMLArguments[K]", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + exprFunc, err := createParseXMLFunction[any](ottl.FunctionContext{}, tt.oArgs) + if tt.createError != "" { + require.ErrorContains(t, err, tt.createError) + return + } + + require.NoError(t, err) + + result, err := exprFunc(context.Background(), nil) + if tt.parseError != "" { + require.ErrorContains(t, err, tt.parseError) + return + } + + assert.NoError(t, err) + + resultMap, ok := result.(pcommon.Map) + require.True(t, ok) + + require.Equal(t, tt.want, resultMap.AsRaw()) + }) + } +} diff --git a/pkg/ottl/ottlfuncs/functions.go b/pkg/ottl/ottlfuncs/functions.go index 1f419a746e42a..9bb33ff3230f0 100644 --- a/pkg/ottl/ottlfuncs/functions.go +++ b/pkg/ottl/ottlfuncs/functions.go @@ -61,6 +61,7 @@ func converters[K any]() []ottl.Factory[K] { NewParseCSVFactory[K](), NewParseJSONFactory[K](), NewParseKeyValueFactory[K](), + NewParseXMLFactory[K](), NewSecondsFactory[K](), NewSHA1Factory[K](), NewSHA256Factory[K](),