-
Notifications
You must be signed in to change notification settings - Fork 7
/
main.go
56 lines (47 loc) · 1.33 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
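// Command span-local-data extracts a few fields (ID, source ID, DOI and labels)
// from JSON records read from standard input and prints them as one
// comma-separated line per record - something `jq` can do just as well, albeit
// a bit slower.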
package main
import (
"bufio"
"bytes"
"encoding/json"
"flag"
"fmt"
"io"
"os"
"strings"
log "github.com/sirupsen/logrus"
"github.com/miku/span/parallel"
)
// record is a subset of the intermediate schema fields. Only the keys named in
// the struct tags below are decoded; any other key in the JSON document is
// ignored by encoding/json when unmarshaling into this type.
type record struct {
// ID is decoded from the "finc.id" key.
ID string `json:"finc.id,omitempty"`
// SourceID is decoded from the "finc.source_id" key.
SourceID string `json:"finc.source_id,omitempty"`
// DOI is decoded from the "doi" key.
DOI string `json:"doi,omitempty"`
// Labels is decoded from the "x.labels" key, which must hold a JSON array
// of strings (or be absent/null, leaving the slice nil).
Labels []string `json:"x.labels,omitempty"`
}
// WriteFields writes values to w as a single line: the values are joined with
// commas and terminated by a newline. Values are written verbatim; no quoting
// or escaping is applied, so a value that itself contains a comma or a newline
// cannot be told apart from a field or record boundary in the output. An empty
// or nil slice produces a line consisting of just the newline.
//
// It returns the number of bytes written and any error reported by w.
func WriteFields(w io.Writer, values []string) (int, error) {
	// Fprintln appends the trailing newline and formats directly into w.
	return fmt.Fprintln(w, strings.Join(values, ","))
}
// main reads JSON records from standard input, decodes the fields declared on
// record from each of them and writes one comma-separated line per record
// (ID, source ID, DOI, then any labels) to standard output.
//
// Flags:
//
//	-b  batch size handed to the parallel processor (default 25000)
//
// Any error returned by the processor, or by the final flush of standard
// output, terminates the program via log.Fatal.
func main() {
	batchsize := flag.Int("b", 25000, "batch size")
	flag.Parse()

	// Hand the buffered writer to the processor so output actually goes
	// through it; previously it was created and flushed but never written to.
	// NOTE(review): bufio.Writer is not safe for concurrent use, so this
	// assumes the processor serializes its writes - confirm against
	// parallel.Processor.
	bw := bufio.NewWriter(os.Stdout)

	// The callback turns one input item into one output line. b is presumably
	// a single JSON document as delivered by the processor - verify against
	// the parallel package.
	p := parallel.NewProcessor(os.Stdin, bw, func(_ int64, b []byte) ([]byte, error) {
		var doc record
		if err := json.Unmarshal(b, &doc); err != nil {
			return nil, err
		}
		fields := append([]string{doc.ID, doc.SourceID, doc.DOI}, doc.Labels...)
		var buf bytes.Buffer
		if _, err := WriteFields(&buf, fields); err != nil {
			return nil, err
		}
		return buf.Bytes(), nil
	})
	p.BatchSize = *batchsize

	runErr := p.Run()
	// Flush explicitly instead of deferring: log.Fatal exits the process
	// without running deferred calls, which would drop buffered output, and a
	// deferred Flush would also hide a write error.
	if err := bw.Flush(); err != nil && runErr == nil {
		runErr = err
	}
	if runErr != nil {
		log.Fatal(runErr)
	}
}