forked from habeanf/yap
-
Notifications
You must be signed in to change notification settings - Fork 21
/
hebma.go
102 lines (92 loc) · 3.31 KB
/
hebma.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
package app
import (
"yap/nlp/format/lattice"
"yap/nlp/format/lex"
"yap/nlp/format/raw"
"yap/nlp/parser/ma"
"yap/nlp/parser/xliter8"
nlp "yap/nlp/types"
// "yap/util"
"fmt"
"log"
// "os"
"github.com/gonuts/commander"
"github.com/gonuts/flag"
)
// Command-line state for the "hebma" command. Every variable here is bound to
// a flag in HebMACmd and read by HebMA / HebMAConfigOut.
var (
// prefixFile is set by -prefix and passed to BGULex.LoadPrefixes;
// lexiconFile is set by -lexicon and passed to BGULex.LoadLex.
prefixFile, lexiconFile string
// xliter8out is set by -xliter8out; when true HebMA hands an
// xliter8.Hebrew to lattice.Sentence2LatticeCorpus.
// alwaysnnp is set by -alwaysnnp and copied to BGULex.AlwaysNNP.
xliter8out, alwaysnnp bool
// nnpnofeats is set by -addnnpnofeats and passed as the second argument
// of BGULex.LoadLex.
nnpnofeats bool
// showoov is set by -showoov and copied to BGULex.LogOOV.
showoov bool
)
func HebMAConfigOut() {
log.Println("Configuration")
log.Printf("Heb Lexicon:\t\t%s", prefixFile)
log.Printf("Heb Prefix:\t\t%s", lexiconFile)
log.Printf("OOV Strategy:\t%v", "Const:NNP")
log.Printf("xliter8 out:\t\t%v", xliter8out)
log.Println()
log.Printf("Raw Input:\t\t%s", inRawFile)
log.Printf("Output:\t\t%s", outLatticeFile)
log.Println()
}
// HebMA is the Run function of the "hebma" command. It loads the BGU prefix
// and lexicon files, reads the raw input, runs the morphological analyzer on
// every sentence, logs token/OOV statistics and writes the resulting lattices
// to the output file.
//
// cmd is the command whose flags are verified; args is not used.
//
// It returns a non-nil error when the raw input file cannot be read
// (previously this case panicked); otherwise it returns nil.
func HebMA(cmd *commander.Command, args []string) error {
	requiredFlags := []string{"prefix", "lexicon", "raw", "out"}
	// VerifyFlags is defined elsewhere in the package.
	// NOTE(review): presumably it aborts when a required flag is unset - confirm.
	VerifyFlags(cmd, requiredFlags)
	HebMAConfigOut()

	maData := new(ma.BGULex)
	log.Println("Reading Morphological Analyzer BGU Prefixes")
	maData.LoadPrefixes(prefixFile)
	log.Println("Reading Morphological Analyzer BGU Lexicon")
	maData.LoadLex(lexiconFile, nnpnofeats)
	log.Println()

	sents, err := raw.ReadFile(inRawFile, limit)
	if err != nil {
		// Report the failure through the error return the signature already
		// provides instead of panicking; the message text is unchanged.
		return fmt.Errorf("Failed reading raw file - %v", err)
	}

	log.Println("Running Hebrew Morphological Analysis")
	lattices := make([]nlp.LatticeSentence, len(sents))
	stats := new(ma.AnalyzeStats)
	stats.Init()
	maData.Stats = stats
	maData.AlwaysNNP = alwaysnnp
	maData.LogOOV = showoov

	// While analyzing, tag every log line with the index of the sentence
	// being processed, then put the original prefix back.
	prefix := log.Prefix()
	analyze := func() {
		// Deferred so the prefix is restored even if Analyze panics.
		defer log.SetPrefix(prefix)
		for i, sent := range sents {
			log.SetPrefix(fmt.Sprintf("%v graph# %v ", prefix, i))
			// NOTE(review): the second result of Analyze is discarded, as in
			// the original; confirm it carries nothing that must be handled.
			lattices[i], _ = maData.Analyze(sent.Tokens())
		}
	}
	analyze()

	log.Println("Analyzed", stats.TotalTokens, "occurences of", len(stats.UniqTokens), "unique tokens")
	log.Println("Encountered", stats.OOVTokens, "occurences of", len(stats.UniqOOVTokens), "unknown tokens")

	// A nil transliterator is passed unless -xliter8out was given.
	var hebrew xliter8.Interface
	if xliter8out {
		hebrew = &xliter8.Hebrew{}
	}
	output := lattice.Sentence2LatticeCorpus(lattices, hebrew)
	// NOTE(review): if lattice.WriteFile returns an error it is dropped here,
	// as in the original; confirm its signature and propagate it if so.
	lattice.WriteFile(outLatticeFile, output)
	return nil
}
// HebMACmd constructs the commander.Command for the "hebma" subcommand, with
// HebMA as its Run function, and registers every command-line flag on it.
// Each flag is bound to a package-level variable that HebMA reads.
func HebMACmd() *commander.Command {
	const summary = "run lexicon-based morphological analyzer on raw input"

	hebma := &commander.Command{
		Run:       HebMA,
		UsageLine: "hebma <file options> [arguments]",
		Short:     summary,
		Long: `
run lexicon-based morphological analyzer on raw input
$ ./yap hebma -prefix <prefix file> -lexicon <lexicon file> -raw <raw file> -out <output file> [options]
`,
		// NOTE(review): the flag set is named "ma" although the command is
		// "hebma"; kept as-is - confirm whether that is intentional.
		Flag: *flag.NewFlagSet("ma", flag.ExitOnError),
	}

	// Register on the command's own flag set (the struct holds a copy).
	flags := &hebma.Flag

	// Input and output files.
	flags.StringVar(&prefixFile, "prefix", "", "Prefix file for morphological analyzer")
	flags.StringVar(&lexiconFile, "lexicon", "", "Lexicon file for morphological analyzer")
	flags.StringVar(&inRawFile, "raw", "", "Input raw (tokenized) file")
	flags.StringVar(&outLatticeFile, "out", "", "Output lattice file")

	// Analysis and output options.
	flags.BoolVar(&xliter8out, "xliter8out", false, "Transliterate output lattice file")
	flags.BoolVar(&alwaysnnp, "alwaysnnp", false, "Always add NNP to tokens and prefixed subtokens")
	flags.BoolVar(&nnpnofeats, "addnnpnofeats", false, "Add NNP in lex but without features")
	flags.IntVar(&limit, "limit", 0, "Limit input set")

	// Diagnostics.
	flags.BoolVar(&showoov, "showoov", false, "Output OOV tokens")
	flags.BoolVar(&lex.LOG_FAILURES, "showlexerror", false, "Log errors encountered when loading the lexicon")

	return hebma
}