forked from apache/lucene
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TokenInfoDictionary.java
131 lines (117 loc) · 4.67 KB
/
TokenInfoDictionary.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.dict;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
/**
 * Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST
 * mapping to a list of wordIDs.
 */
public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphData> {

  /**
   * Suffix appended to this class's simple name to form the name of the bundled FST resource (see
   * {@link #getClassResource(String)}).
   */
  public static final String FST_FILENAME_SUFFIX = "$fst.dat";

  /** FST wrapper built from the FST resource in the constructor. */
  private final TokenInfoFST fst;

  /** Morphological data built from the superclass buffer and the POS resource. */
  private final TokenInfoMorphData morphAtts;

  /**
   * Create a {@link TokenInfoDictionary} from an external resource path.
   *
   * <p>Each path is wrapped in a supplier that opens it with {@link Files#newInputStream}; the
   * actual loading is delegated to the supplier-based constructor.
   *
   * @param targetMapFile where to load target map resource
   * @param posDictFile where to load POS dictionary resource
   * @param dictFile where to load dictionary entries resource
   * @param fstFile where to load encoded FST data resource
   * @throws IOException if resource was not found or broken
   */
  public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
      throws IOException {
    this(
        () -> Files.newInputStream(targetMapFile),
        () -> Files.newInputStream(posDictFile),
        () -> Files.newInputStream(dictFile),
        () -> Files.newInputStream(fstFile));
  }

  /**
   * Create a {@link TokenInfoDictionary} from the resources located next to this class. Only used
   * by {@link SingletonHolder} to build the shared instance.
   *
   * @throws IOException if a resource could not be opened or read
   */
  private TokenInfoDictionary() throws IOException {
    this(
        () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
        () -> getClassResource(POSDICT_FILENAME_SUFFIX),
        () -> getClassResource(DICT_FILENAME_SUFFIX),
        () -> getClassResource(FST_FILENAME_SUFFIX));
  }

  /**
   * Create a {@link TokenInfoDictionary} from input stream suppliers.
   *
   * <p>The target map and dictionary suppliers are handed to the superclass constructor; the POS
   * supplier is consumed by {@link TokenInfoMorphData}; the FST supplier is read here through a
   * buffered stream that is closed before this constructor returns.
   *
   * @param targetMapResource supplies a stream where the target map can be read
   * @param posResource supplies a stream where the pos resource can be read
   * @param dictResource supplies a stream where the dictionary can be read
   * @param fstResource supplies a stream where the FST can be read
   * @throws IOException if the supplied stream could not be read
   */
  public TokenInfoDictionary(
      IOSupplier<InputStream> targetMapResource,
      IOSupplier<InputStream> posResource,
      IOSupplier<InputStream> dictResource,
      IOSupplier<InputStream> fstResource)
      throws IOException {
    super(
        targetMapResource,
        dictResource,
        DictionaryConstants.TARGETMAP_HEADER,
        DictionaryConstants.DICT_HEADER,
        DictionaryConstants.VERSION);
    // `buffer` is inherited from the superclass and is available once super(...) has returned.
    this.morphAtts = new TokenInfoMorphData(buffer, posResource);

    // Named differently from the `fst` field so the local does not shadow it.
    FST<Long> loadedFst;
    try (InputStream is = new BufferedInputStream(fstResource.get())) {
      DataInput in = new InputStreamDataInput(is);
      // The same input is passed for both stream parameters of the FST constructor.
      loadedFst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
    }
    // TODO: some way to configure? (the second argument is currently hard-coded to true)
    this.fst = new TokenInfoFST(loadedFst, true);
  }

  /**
   * Opens the class-relative resource whose name is this class's simple name followed by {@code
   * suffix}.
   *
   * @param suffix resource name suffix, e.g. {@link #FST_FILENAME_SUFFIX}
   * @return the stream returned by {@code IOUtils.requireResourceNonNull} for that resource
   *     (presumably never {@code null}; a missing resource is expected to be reported by that
   *     helper instead)
   * @throws IOException if the resource check fails
   */
  private static InputStream getClassResource(String suffix) throws IOException {
    final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
    return IOUtils.requireResourceNonNull(
        TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);
  }

  /** Returns the morphological data created when this dictionary was constructed. */
  @Override
  public TokenInfoMorphData getMorphAttributes() {
    return morphAtts;
  }

  /** Returns the {@link TokenInfoFST} created when this dictionary was constructed. */
  public TokenInfoFST getFST() {
    return fst;
  }

  /**
   * Returns the shared instance built from the resources located next to this class.
   *
   * <p>The instance is created in the static initializer of {@link SingletonHolder}; a failure to
   * load is rethrown from that initializer as a {@link RuntimeException} whose cause is the
   * original {@link IOException}.
   */
  public static TokenInfoDictionary getInstance() {
    return SingletonHolder.INSTANCE;
  }

  /** Holds the shared instance; it is created when this holder class is initialized. */
  private static class SingletonHolder {
    static final TokenInfoDictionary INSTANCE;

    static {
      try {
        INSTANCE = new TokenInfoDictionary();
      } catch (IOException ioe) {
        // A static initializer cannot throw a checked exception, so wrap it and keep the cause.
        throw new RuntimeException("Cannot load TokenInfoDictionary.", ioe);
      }
    }
  }
}