Skip to content

Commit

Permalink
enhancement: improve content extraction stop word cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
fschade committed Oct 23, 2023
1 parent 9283aad commit 8134151
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Enhancement: Tika content extraction cleanup for search

Until now it was not possible to configure whether
stop words should be removed from the content extracted for search.

This can now be controlled with the newly introduced setting `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS`.
Stop word cleaning is enabled by default; set the option to `false` to disable it.

In addition, the stop word cleanup is less aggressive: it now keeps numbers, URLs,
and basically everything else except the defined stop words.

https://github.com/owncloud/ocis/pull/7553
https://github.com/owncloud/ocis/issues/6674
3 changes: 2 additions & 1 deletion services/search/pkg/config/content.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ type Extractor struct {

// ExtractorTika configures the Tika extractor.
// The struct tags on each field name its YAML key, its environment variable and a short description.
type ExtractorTika struct {
// TikaURL is the URL of the Tika server.
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
// CleanStopWords defines whether stop words should be cleaned.
CleanStopWords bool `yaml:"clean_stop_words" env:"SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS" desc:"Defines if stop words should be cleaned or not."`
}
3 changes: 2 additions & 1 deletion services/search/pkg/config/defaults/defaultconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ func DefaultConfig() *config.Config {
Type: "basic",
CS3AllowInsecure: false,
Tika: config.ExtractorTika{
TikaURL: "http://127.0.0.1:9998",
TikaURL: "http://127.0.0.1:9998",
CleanStopWords: true,
},
},
Events: config.Events{
Expand Down
14 changes: 14 additions & 0 deletions services/search/pkg/content/content.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
package content

import (
"strings"

"github.com/bbalet/stopwords"
)

// init replaces the word segmenter of the stopwords package when this package is loaded.
func init() {
	// The pattern matches every run of characters that contains no space, so a
	// token such as a URL or "user1" reaches the stop word filter as one word.
	const wordPattern = `[^ ]+`

	stopwords.OverwriteWordSegmenter(wordPattern)
}

// Document wraps all resource meta fields,
// it is used as a content extraction result.
type Document struct {
Expand All @@ -11,3 +21,7 @@ type Document struct {
MimeType string
Tags []string
}

// CleanString removes the stop words of the language identified by langCode
// from content and returns the result without leading or trailing whitespace.
//
// The work is delegated to stopwords.CleanString; the final true argument is
// that function's cleanHTML flag.
func CleanString(content, langCode string) string {
	cleaned := stopwords.CleanString(content, langCode, true)

	return strings.TrimSpace(cleaned)
}
36 changes: 36 additions & 0 deletions services/search/pkg/content/content_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package content_test

import (
"testing"

. "github.com/stretchr/testify/assert"

"github.com/owncloud/ocis/v2/services/search/pkg/content"
)

func TestCleanContent(t *testing.T) {
tests := []struct {
given string
expect string
}{
{
given: "find can keeper should keeper will",
expect: "keeper keeper",
},
{
given: "user1 shares the file to Marie",
expect: "user1 shares file marie",
},
{
given: "content contains https://localhost/remote.php/dav/files/admin/Photos/San%20Francisco.jpg and stop word",
expect: "content contains https://localhost/remote.php/dav/files/admin/photos/san%20francisco.jpg stop word",
},
}

for _, tc := range tests {
tc := tc
t.Run(tc.given, func(t *testing.T) {
Equal(t, tc.expect, content.CleanString(tc.given, "en"))
})
}
}
8 changes: 5 additions & 3 deletions services/search/pkg/content/tika.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ import (
"fmt"
"strings"

"github.com/bbalet/stopwords"
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
"github.com/cs3org/reva/v2/pkg/rgrpc/todo/pool"
"github.com/google/go-tika/tika"

"github.com/owncloud/ocis/v2/ocis-pkg/log"
"github.com/owncloud/ocis/v2/services/search/pkg/config"
)
Expand All @@ -21,6 +21,7 @@ type Tika struct {
Retriever
tika *tika.Client
contentExtractionSizeLimit uint64
cleanStopWords bool
}

// NewTikaExtractor creates a new Tika instance.
Expand All @@ -42,6 +43,7 @@ func NewTikaExtractor(gatewaySelector pool.Selectable[gateway.GatewayAPIClient],
Retriever: newCS3Retriever(gatewaySelector, logger, cfg.Extractor.CS3AllowInsecure),
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
cleanStopWords: cfg.Extractor.Tika.CleanStopWords,
}, nil
}

Expand Down Expand Up @@ -86,8 +88,8 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
}
}

if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
doc.Content = stopwords.CleanString(doc.Content, lang, true)
if langCode, _ := t.tika.LanguageString(ctx, doc.Content); langCode != "" && t.cleanStopWords {
doc.Content = CleanString(doc.Content, langCode)
}

return doc, nil
Expand Down

0 comments on commit 8134151

Please sign in to comment.