forked from zeromicro/go-zero
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pread.go
139 lines (117 loc) · 2.56 KB
/
pread.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package main
import (
"bufio"
"errors"
"flag"
"fmt"
"log"
"os"
"runtime"
"strconv"
"strings"
"time"
"github.com/tal-tech/go-zero/core/filex"
"github.com/tal-tech/go-zero/core/fx"
"github.com/tal-tech/go-zero/core/logx"
"gopkg.in/cheggaaa/pb.v1"
)
// Command-line flags.
var (
	// concurrent is the number of chunks the input file is split into; it
	// defaults to the number of CPUs.
	concurrent = flag.Int("c", runtime.NumCPU(), "concurrent goroutines")
	// file is the path of the dictionary file to load.
	file = flag.String("f", "", "the input file")
)

// wordVecDic is the process-wide dictionary that FastLoad fills in.
var wordVecDic TXDictionary
// Vector holds the float64 components of one dictionary entry.
type Vector []float64

// TXDictionary is the in-memory form of a dictionary file.
type TXDictionary struct {
	// EmbeddingCount is the first number of the file's header line.
	EmbeddingCount int64
	// Dim is the second number of the header line; every Vector is
	// allocated with this length.
	Dim int64
	// Dict maps the first field of each data line to its parsed Vector.
	Dict map[string]Vector
}

// pair carries one parsed entry from a parsing worker to the goroutine that
// inserts it into the dictionary map.
type pair struct {
	key string
	vec Vector
}
func FastLoad(filename string) error {
if filename == "" {
return errors.New("no available dictionary")
}
now := time.Now()
defer func() {
logx.Infof("article2vec init dictionary end used %v", time.Since(now))
}()
dicFile, err := os.Open(filename)
if err != nil {
return err
}
defer dicFile.Close()
header, err := filex.FirstLine(filename)
if err != nil {
return err
}
total := strings.Split(header, " ")
wordVecDic.EmbeddingCount, err = strconv.ParseInt(total[0], 10, 64)
if err != nil {
return err
}
wordVecDic.Dim, err = strconv.ParseInt(total[1], 10, 64)
if err != nil {
return err
}
wordVecDic.Dict = make(map[string]Vector, wordVecDic.EmbeddingCount)
ranges, err := filex.SplitLineChunks(filename, *concurrent)
if err != nil {
return err
}
info, err := os.Stat(filename)
if err != nil {
return err
}
bar := pb.New64(info.Size()).SetUnits(pb.U_BYTES).Start()
fx.From(func(source chan<- interface{}) {
for _, each := range ranges {
source <- each
}
}).Walk(func(item interface{}, pipe chan<- interface{}) {
offsetRange := item.(filex.OffsetRange)
scanner := bufio.NewScanner(filex.NewRangeReader(dicFile, offsetRange.Start, offsetRange.Stop))
scanner.Buffer([]byte{}, 1<<20)
reader := filex.NewProgressScanner(scanner, bar)
if offsetRange.Start == 0 {
// skip header
reader.Scan()
}
for reader.Scan() {
text := reader.Text()
elements := strings.Split(text, " ")
vec := make(Vector, wordVecDic.Dim)
for i, ele := range elements {
if i == 0 {
continue
}
v, err := strconv.ParseFloat(ele, 64)
if err != nil {
return
}
vec[i-1] = v
}
pipe <- pair{
key: elements[0],
vec: vec,
}
}
}).ForEach(func(item interface{}) {
p := item.(pair)
wordVecDic.Dict[p.key] = p.vec
})
return nil
}
// main parses the command-line flags, loads the dictionary named by -f and
// prints the number of loaded entries followed by the elapsed time. Any load
// error terminates the program through log.Fatal.
func main() {
	flag.Parse()

	began := time.Now()
	err := FastLoad(*file)
	if err != nil {
		log.Fatal(err)
	}

	entries := len(wordVecDic.Dict)
	fmt.Println(entries)
	fmt.Println(time.Since(began))
}