-
Notifications
You must be signed in to change notification settings - Fork 7
/
main.go
102 lines (95 loc) · 2.72 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// Sniff out DOIs from JSON documents and optionally update each document with the DOI found.
package main
import (
"flag"
"fmt"
"log"
"os"
"regexp"
"runtime"
"strings"
"github.com/miku/span/doi"
)
// Build metadata and command line flags; flag values are read in main after flag.Parse.
var (
// Version and Buildtime are printed when the -version flag is given.
// NOTE(review): presumably injected at build time (e.g. via -ldflags) — confirm against the build setup.
Version string
Buildtime string
// noSkipUnmatched (-S) is negated in main and handed to the sniffer as SkipUnmatched.
noSkipUnmatched = flag.Bool("S", false, "do not skip unmatched documents")
// updateKey (-k) is handed to the sniffer as UpdateKey.
updateKey = flag.String("k", "doi_str_mv", "update key")
// forceOverwrite (-f) is declared here only.
// NOTE(review): nothing in this file reads this flag, so -f appears to have no effect — confirm and wire it up or remove it.
forceOverwrite = flag.Bool("f", false, "force update, even if updateKey field exists")
// identifierKey (-i) is handed to the sniffer as IdentifierKey.
identifierKey = flag.String("i", "id", "identifier key")
// ignoreKeys (-K) is split on "," in main and every field is compiled as a regular expression.
ignoreKeys = flag.String("K", "barcode,dewey", "ignore keys (regexp), comma separated") // TODO: repeated flag
// numWorkers (-w) defaults to the number of logical CPUs reported by the runtime.
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
// batchSize (-b) is handed to the sniffer as BatchSize.
batchSize = flag.Int("b", 5000, "batch size")
// showVersion (-version) makes main print Version and Buildtime and exit with status 0.
showVersion = flag.Bool("version", false, "show version and exit")
)
// main parses the command line flags, builds a doi.Sniffer wired to stdin and
// stdout, and runs it. A bad ignore pattern or a failing run terminates the
// program through log.Fatal.
func main() {
	flag.Parse()
	if *showVersion {
		// NOTE(review): the program name printed here is "makta" — confirm this is intended.
		fmt.Printf("makta %s %s\n", Version, Buildtime)
		os.Exit(0)
	}
	ignore, err := stringToRegexpSlice(*ignoreKeys, ",")
	if err != nil {
		log.Fatal(err)
	}
	// NOTE(review): the -f flag (forceOverwrite) is not passed on to the sniffer here.
	sniffer := &doi.Sniffer{
		Reader:        os.Stdin,
		Writer:        os.Stdout,
		SkipUnmatched: !*noSkipUnmatched,
		UpdateKey:     *updateKey,
		IdentifierKey: *identifierKey,
		MapSniffer: &doi.MapSniffer{
			Pattern:    regexp.MustCompile(doi.PatDOI),
			IgnoreKeys: ignore,
		},
		// Custom postprocessing, cannot be changed from flags.
		PostProcess: cleanDOI,
		NumWorkers:  *numWorkers,
		BatchSize:   *batchSize,
	}
	if err := sniffer.Run(); err != nil {
		log.Fatal(err)
	}
}

// cleanDOI trims surrounding whitespace from s and then removes at most one
// piece of trailing residue. The rules are tried in order and the first match
// wins:
//
//   - a trailing "])"
//   - a trailing "/epdf"
//   - a trailing ")" when s contains no "("
//   - a trailing "]" when s contains no "["
//   - one trailing character out of . , : * ” '
//
// If no rule matches, the whitespace-trimmed string is returned unchanged.
func cleanDOI(s string) string {
	// Single trailing characters that get stripped when no bracket rule applies.
	trailing := []string{".", ",", ":", "*", `”`, "'"}

	s = strings.TrimSpace(s)
	switch {
	case strings.HasSuffix(s, "])"):
		// ai-179-z4p6s 10.24072/pci.ecology.100076])
		return strings.TrimSuffix(s, "])")
	case strings.HasSuffix(s, "/epdf"):
		return strings.TrimSuffix(s, "/epdf")
	case strings.HasSuffix(s, ")") && !strings.Contains(s, "("):
		// ai-179-wynjb 10.1016/j.jenvp.2019.01.011)
		return strings.TrimSuffix(s, ")")
	case strings.HasSuffix(s, "]") && !strings.Contains(s, "["):
		// ai-28-29f64b012591451f83832a41c64bed83 10.5329/RECADM.20090802005]
		return strings.TrimSuffix(s, "]")
	case hasAnySuffix(s, trailing):
		// Remove the matched suffix as a whole rather than a single byte: `”`
		// is three bytes in UTF-8, so cutting one byte would leave a
		// truncated, invalid sequence at the end of the result.
		for _, suffix := range trailing {
			if strings.HasSuffix(s, suffix) {
				s = strings.TrimSuffix(s, suffix)
				break
			}
		}
		return s
	default:
		return s
	}
}
// hasAnySuffix reports whether s ends with at least one entry of suffixes.
// The entries are checked in order and the scan stops at the first hit; an
// empty or nil list yields false.
func hasAnySuffix(s string, suffixes []string) bool {
	for i := 0; i < len(suffixes); i++ {
		if !strings.HasSuffix(s, suffixes[i]) {
			continue
		}
		return true
	}
	return false
}
// stringToRegexpSlice splits s on sep and compiles every non-empty field into
// a regular expression, keeping the order of the fields.
//
// An empty s yields a nil slice and a nil error. Empty fields, as left behind
// by a doubled or trailing separator, are skipped: an empty pattern matches
// every string, which is never what a stray separator is meant to express.
// On the first field that fails to compile, a nil slice is returned together
// with an error that names the offending field and wraps the compile error.
func stringToRegexpSlice(s string, sep string) ([]*regexp.Regexp, error) {
	if len(s) == 0 {
		return nil, nil
	}
	var result []*regexp.Regexp
	for _, field := range strings.Split(s, sep) {
		if field == "" {
			continue
		}
		re, err := regexp.Compile(field)
		if err != nil {
			return nil, fmt.Errorf("compiling pattern %q: %w", field, err)
		}
		result = append(result, re)
	}
	return result, nil
}