This repository has been archived by the owner on Feb 22, 2020. It is now read-only.
/
filter.go
396 lines (317 loc) · 9.19 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
// Copyright 2017 Martin Planer. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package felix
import (
"bytes"
"fmt"
"net/url"
"regexp"
"strings"
"unicode"
"path/filepath"
"context"
"time"
"io"
"io/ioutil"
"github.com/pkg/errors"
)
// Stringer is an optional interface for all filters to provide a 'native' textual representation.
// FilterString concatenates the String output of every filter that implements it.
type Stringer interface {
// String returns the textual representation of the filter.
String() string
}
// FilterString returns the concatenated String output of all passed filters
// that implement the Stringer interface.
//
// Item filters are visited first, then link filters, each in slice order.
// Filters that do not implement Stringer contribute nothing to the result.
func FilterString(itemFilters []ItemFilter, linkFilters []LinkFilter) string {
	var out bytes.Buffer

	// appendIfStringer writes the textual form of filter, if it has one.
	appendIfStringer := func(filter interface{}) {
		if named, ok := filter.(Stringer); ok {
			out.WriteString(named.String())
		}
	}

	for i := range itemFilters {
		appendIfStringer(itemFilters[i])
	}
	for i := range linkFilters {
		appendIfStringer(linkFilters[i])
	}
	return out.String()
}
// ItemFilter wraps the Filter method for items.
//
// Filter evaluates the given item, optionally modifies it, and passes it
// to the next filter in the filter chain, if it matches the filter criteria.
// An item is dropped by returning without calling next.
type ItemFilter interface {
// Filter inspects item and calls next with it to forward it down the chain.
Filter(item Item, next func(Item))
}
// ItemFilterFunc is an adapter to allow the use of ordinary functions as filters.
// If f is a function with the appropriate signature, ItemFilterFunc(f) is an ItemFilter that calls f.
type ItemFilterFunc func(Item, func(Item))
// Filter calls the underlying ItemFilterFunc with the given item and next
// callback, and implements ItemFilter.
func (f ItemFilterFunc) Filter(item Item, next func(Item)) {
f(item, next)
}
// itemFilter is an internal helper type that pairs an ItemFilter with a fixed
// textual representation, providing an additional Stringer implementation.
type itemFilter struct {
// Embedded filter; its Filter method is promoted to itemFilter.
ItemFilter
// s is the text returned by String.
s string
}
// String returns the stored textual representation and implements Stringer.
func (f itemFilter) String() string {
return f.s
}
// FilterItems receives items from in until that channel is closed, runs each
// one through the chain built from filters, and sends every item that reaches
// the end of the chain to out. Once in is drained, out is closed.
//
// There is no separate quit channel: closing in is the only way to make
// FilterItems return. Sends to out block until a receiver accepts the item.
func FilterItems(in <-chan Item, out chan<- Item, filters ...ItemFilter) {
	chain := buildItemFilterChain(filters...)

	// emit is the terminal step of the chain: whatever survives all filters
	// is forwarded to the output channel.
	emit := func(item Item) {
		out <- item
	}

	for {
		item, open := <-in
		if !open {
			break
		}
		chain.Filter(item, emit)
	}
	close(out)
}
// buildItemFilterChain combines filters into a single ItemFilter that applies
// them in order: an item is handed to the first filter, and whatever that
// filter forwards is handed to the second, and so on. The next callback given
// to the resulting filter is invoked for items forwarded by the last filter.
//
// With no filters the result forwards every item unchanged; with exactly one
// filter that filter is returned as is.
//
// The chain for the remaining filters is constructed once, here, instead of
// being rebuilt inside the callback for every single item.
func buildItemFilterChain(filters ...ItemFilter) ItemFilter {
	switch len(filters) {
	case 0:
		// Pass-through filter for an empty chain.
		return ItemFilterFunc(func(item Item, next func(Item)) {
			next(item)
		})
	case 1:
		return filters[0]
	}

	head := filters[0]
	tail := buildItemFilterChain(filters[1:]...)
	return ItemFilterFunc(func(item Item, next func(Item)) {
		head.Filter(item, func(item Item) {
			tail.Filter(item, next)
		})
	})
}
// ItemTitleFilter filters items based on the given title strings.
//
// Every title and every item title is normalized with sanitizeTitle (lower
// case, runs of non-alphanumeric characters collapsed into a single space).
// A title is then split into its words, and an item matches that title when
// each word occurs as a substring of the normalized item title. An item is
// forwarded once, as soon as the first matching title is found; items that
// match no title are dropped.
//
// NOTE: a title without any alphanumeric character normalizes to the empty
// string, whose single empty word is contained in every item title, so such a
// title matches all items.
//
// The returned filter implements Stringer and reports one "ITEM_TITLE:" line
// per given title, using the original (non-normalized) title text.
func ItemTitleFilter(titles ...string) ItemFilter {
	var desc bytes.Buffer
	wanted := make([][]string, len(titles))
	for i, raw := range titles {
		wanted[i] = strings.Split(sanitizeTitle(raw), " ")
		fmt.Fprintf(&desc, "ITEM_TITLE:%s\n", raw)
	}

	match := func(item Item, next func(Item)) {
		normalized := sanitizeTitle(item.Title)
	candidates:
		for _, words := range wanted {
			for _, word := range words {
				if !strings.Contains(normalized, word) {
					// One missing word rules this title out; try the next one.
					continue candidates
				}
			}
			next(item)
			return
		}
	}

	return itemFilter{ItemFilter: ItemFilterFunc(match), s: desc.String()}
}
// sanitizeTitle normalizes a title for easier comparison.
//
// Letters and digits are kept and converted to lower case. Every run of other
// characters that sits between two kept characters is replaced by a single
// space; such runs at the very beginning or end of the title are removed.
func sanitizeTitle(title string) string {
	out := make([]rune, 0, len(title))

	// pendingGap records that separators were seen after at least one kept
	// character, so a space is due before the next kept character.
	pendingGap := false
	for _, r := range title {
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
			pendingGap = len(out) > 0
			continue
		}
		if pendingGap {
			out = append(out, ' ')
			pendingGap = false
		}
		out = append(out, unicode.ToLower(r))
	}
	return string(out)
}
// LinkFilter wraps the Filter method for links.
//
// Filter evaluates the given link, optionally modifies it, and passes it
// to the next filter in the filter chain, if it matches the filter criteria.
// A link is dropped by returning without calling next.
type LinkFilter interface {
// Filter inspects link and calls next with it to forward it down the chain.
Filter(link Link, next func(Link))
}
// LinkFilterFunc is an adapter to allow the use of ordinary functions as filters.
// If f is a function with the appropriate signature, LinkFilterFunc(f) is a LinkFilter that calls f.
type LinkFilterFunc func(Link, func(Link))
// Filter calls the underlying LinkFilterFunc with the given link and next
// callback, and implements LinkFilter.
func (f LinkFilterFunc) Filter(link Link, next func(Link)) {
f(link, next)
}
// FilterLinks receives links from in until that channel is closed, runs each
// one through the chain built from filters, and sends every link that reaches
// the end of the chain to out. Once in is drained, out is closed.
//
// There is no separate quit channel: closing in is the only way to make
// FilterLinks return. Sends to out block until a receiver accepts the link.
func FilterLinks(in <-chan Link, out chan<- Link, filters ...LinkFilter) {
	chain := buildLinkFilterChain(filters...)

	// emit is the terminal step of the chain: whatever survives all filters
	// is forwarded to the output channel.
	emit := func(link Link) {
		out <- link
	}

	for {
		link, open := <-in
		if !open {
			break
		}
		chain.Filter(link, emit)
	}
	close(out)
}
// buildLinkFilterChain combines filters into a single LinkFilter that applies
// them in order: a link is handed to the first filter, and whatever that
// filter forwards is handed to the second, and so on. The next callback given
// to the resulting filter is invoked for links forwarded by the last filter.
//
// With no filters the result forwards every link unchanged; with exactly one
// filter that filter is returned as is.
//
// The chain for the remaining filters is constructed once, here, instead of
// being rebuilt inside the callback for every single link.
func buildLinkFilterChain(filters ...LinkFilter) LinkFilter {
	switch len(filters) {
	case 0:
		// Pass-through filter for an empty chain.
		return LinkFilterFunc(func(link Link, next func(Link)) {
			next(link)
		})
	case 1:
		return filters[0]
	}

	head := filters[0]
	tail := buildLinkFilterChain(filters[1:]...)
	return LinkFilterFunc(func(link Link, next func(Link)) {
		head.Filter(link, func(link Link) {
			tail.Filter(link, next)
		})
	})
}
// LinkDuplicatesFilter filters duplicate links based on the link URL.
//
// The filter remembers the URLs of the most recently forwarded links, up to
// size of them (a size of zero or less is treated as 1). A link whose URL is
// currently remembered is dropped; any other link is forwarded and its URL
// takes the place of the oldest remembered one once the window is full.
// Dropped duplicates do not refresh the position of their URL in the window.
//
// The returned filter keeps its window in a map and a slice that are updated
// without any synchronization.
func LinkDuplicatesFilter(size int) LinkFilter {
	if size < 1 {
		size = 1
	}

	var (
		window []string                // remembered URLs, oldest first
		seen   = make(map[string]bool) // membership index for window
	)

	dedupe := func(link Link, next func(Link)) {
		current := link.URL
		if seen[current] {
			return
		}

		seen[current] = true
		window = append(window, current)

		// Evict from the front until the window fits again. The slice is
		// only resliced here; append above is what eventually moves the
		// remaining entries to a fresh backing array.
		for len(window) > size {
			delete(seen, window[0])
			window = window[1:]
		}

		next(link)
	}
	return LinkFilterFunc(dedupe)
}
// LinkDomainFilter filters links based on the given domains.
//
// A link is forwarded only when the host name of its URL (as reported by
// URL.Hostname, i.e. without a port) is equal to one of domains. Both sides
// are compared in lower case, and the given domains are trimmed of surrounding
// whitespace. The comparison is an exact match: a subdomain does not match
// its parent domain.
//
// Links whose URL cannot be parsed are dropped.
func LinkDomainFilter(domains ...string) LinkFilter {
	allowed := make(map[string]struct{}, len(domains))
	for _, domain := range domains {
		allowed[strings.ToLower(strings.TrimSpace(domain))] = struct{}{}
	}

	return LinkFilterFunc(func(link Link, next func(Link)) {
		// Trim the URL before parsing, like the other URL based link filters
		// in this file do. Without it, surrounding whitespace can make
		// url.Parse fail and the link would be dropped silently.
		u, err := url.Parse(strings.TrimSpace(link.URL))
		if err != nil {
			return
		}

		if _, ok := allowed[strings.ToLower(u.Hostname())]; ok {
			next(link)
		}
	})
}
// LinkURLRegexFilter filters links based on their URLs matching the given
// regular expressions.
//
// All expressions are compiled up front; if one of them does not compile, a
// nil filter and the wrapped compile error are returned. The resulting filter
// forwards a link once if its whitespace-trimmed URL matches at least one of
// the expressions, and drops it otherwise. Without any expression no link is
// forwarded.
func LinkURLRegexFilter(exprs ...string) (LinkFilter, error) {
	compiled := make([]*regexp.Regexp, len(exprs))
	for i, pattern := range exprs {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return nil, errors.Wrap(err, "could not compile regular expression")
		}
		compiled[i] = re
	}

	matchAny := func(link Link, next func(Link)) {
		target := strings.TrimSpace(link.URL)
		for _, re := range compiled {
			if re.MatchString(target) {
				next(link)
				return
			}
		}
	}
	return LinkFilterFunc(matchAny), nil
}
// LinkFilenameAsTitleFilter extracts the filename from the URL and sets it as
// the new link title. When trimExt is set, the filter tries to remove the file
// extension, if one is present.
//
// Every link is forwarded exactly once. The title is left untouched when the
// URL cannot be parsed, when its path ends in a slash, or when no filename
// can be derived from the path (for example an empty path).
//
// NOTE(review): the filename is derived with path/filepath, which follows the
// rules of the host operating system, although URL paths are always
// slash-separated.
func LinkFilenameAsTitleFilter(trimExt bool) LinkFilter {
	return LinkFilterFunc(func(link Link, next func(Link)) {
		if u, err := url.Parse(strings.TrimSpace(link.URL)); err == nil && !strings.HasSuffix(u.Path, "/") {
			// filepath.Base yields "." for an empty path; anything still
			// containing a slash is not a usable filename either.
			if name := filepath.Base(u.Path); name != "." && !strings.Contains(name, "/") {
				if trimExt {
					name = strings.TrimSuffix(name, filepath.Ext(name))
				}
				link.Title = name
			}
		}
		next(link)
	})
}
// LinkUploadedExpandFilenameFilter expands the filename from an uploaded file URL and sets the appropriate new URL,
// e.g. uploaded.net/file/xxxxxxxx -> uploaded.net/file/xxxxxxxx/file.ext.
// This is sometimes needed for easier filtering down the filter chain.
//
// Only links on the hosts "ul.to" and "uploaded.net" (compared without regard
// to case) whose path has the short form "/<id>" or "/file/<id>" are
// processed; every other parsable link, including one without any path, is
// forwarded unchanged. For a processed link the filter fetches
// "<scheme>://<host>/file/<id>/status" through source with a timeout of five
// seconds and takes the filename from the response via parseULFilename.
//
// Links are dropped when the URL cannot be parsed, when the status request
// fails, or when no filename could be read from the response.
func LinkUploadedExpandFilenameFilter(source Source) LinkFilter {
	return LinkFilterFunc(func(link Link, next func(Link)) {
		u, err := url.Parse(strings.TrimSpace(link.URL))
		// TODO: accept or reject non-parsable URLs?
		if err != nil {
			return
		}

		// Only process "uploaded" domains. The host is compared in lower
		// case, matching how LinkDomainFilter compares host names.
		host := strings.ToLower(u.Hostname())
		if host != "ul.to" && host != "uploaded.net" {
			next(link)
			return
		}

		// Only process short form file URLs: "/<id>" or "/file/<id>".
		segments := strings.Split(strings.Trim(u.Path, "/"), "/")
		var id string
		switch {
		case len(segments) == 1:
			id = segments[0]
		case len(segments) == 2 && segments[0] == "file":
			id = segments[1]
		}
		// An empty id means the URL is not a short form file URL. That also
		// covers a URL without a path, which must not trigger a request for
		// ".../file//status".
		if id == "" {
			next(link)
			return
		}

		statusURL := fmt.Sprintf("%s://%s/file/%s/status", u.Scheme, u.Hostname(), id)

		// TODO: pass context to filters?
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		// NOTE(review): the value returned by source.Get is only read, never
		// closed here; confirm whether Source expects callers to close it.
		reader, err := source.Get(ctx, statusURL)
		if err != nil {
			// fetch failed, skip link
			return
		}

		filename := parseULFilename(reader)
		if filename == "" {
			return
		}

		link.URL = fmt.Sprintf("%s://%s/file/%s/%s", u.Scheme, u.Hostname(), id, filename)
		next(link)
	})
}
// parseULFilename reads r to the end and returns its first line with
// surrounding whitespace removed, which is taken to be the filename.
//
// Trimming also strips the carriage return of a CRLF-terminated line, so it
// cannot leak into the URL that is built from the result. The empty string is
// returned when reading fails or when the first line is blank.
func parseULFilename(r io.Reader) string {
	body, err := ioutil.ReadAll(r)
	if err != nil {
		return ""
	}

	line := string(body)
	if end := strings.IndexByte(line, '\n'); end >= 0 {
		line = line[:end]
	}
	return strings.TrimSpace(line)
}