This repository has been archived by the owner on Oct 3, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
training.go
89 lines (73 loc) · 1.79 KB
/
training.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package main
import (
"analyzer"
"fileutil"
"fmt"
"github.com/parkghost/bayesian"
"goseg"
"io/ioutil"
"log"
"mailfile"
"mailpost"
"path/filepath"
"time"
)
// Classification labels and the token clean-up setting used during training.
var (
	// Good and Bad are the two classes handed to bayesian.NewClassifier and
	// assigned to the training folders.
	Good, Bad bayesian.Class = "Good", "Bad"

	// cutset is passed as the second argument to analyzer.Normalize.
	// NOTE(review): presumably the characters stripped from tokens — confirm
	// against the analyzer package.
	cutset = "1234567890:;=<>"
)
// dictDataFilePath is the dictionary file handed to goseg.NewTokenizerFromFile.
var dictDataFilePath = "dict.data"

// output is the file name the trained classifier is written to.
var output = "bayesian.data"
// trainingData pairs each folder of sample mails with the class its contents
// are learned as. main processes the entries in the order listed here.
var trainingData = []struct {
	folder string         // directory containing one mail per file
	class  bayesian.Class // class every mail in folder is learned as
}{
	{folder: filepath.Join("data", "training", "good"), class: Good},
	{folder: filepath.Join("data", "training", "bad"), class: Bad},
}
// main trains a two-class (Good/Bad) Bayesian classifier from the mail files
// found directly inside each trainingData folder and writes the trained
// classifier to output.
//
// For every folder it prints the number of mails learned, the elapsed time and
// the throughput in mails per second and bytes per second. Any error while
// loading the dictionary, listing a folder, parsing a mail or saving the
// classifier terminates the program via log.Fatal.
func main() {
	classifier := bayesian.NewClassifier(Good, Bad)

	tokenizer, err := goseg.NewTokenizerFromFile(dictDataFilePath)
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range trainingData {
		log.Printf("Traning %s", item.folder)

		totalNum := 0
		var totalSize int64
		startTime := time.Now()

		fis, err := ioutil.ReadDir(item.folder)
		if err != nil {
			log.Fatal(err)
		}

		for _, fi := range fis {
			// Only regular entries are treated as mails; sub-folders are skipped.
			if fi.IsDir() {
				continue
			}
			totalSize += fi.Size()

			filePath := filepath.Join(item.folder, fi.Name())
			mail := mailfile.NewPOP3Mail(filePath)
			if err := mail.Parse(); err != nil {
				// Include the path so the offending file can be identified.
				log.Fatalf("Err: %v, Mail:%s", err, filePath)
			}

			// The mail is closed as soon as the post has been extracted,
			// before the parse result is inspected.
			post, err := mailpost.Parse(mail)
			mail.Close()
			if err != nil {
				log.Fatalf("Err: %v, Mail:%s", err, mail.Path())
			}

			text := post.Subject + " " + post.Content
			words := analyzer.Normalize(tokenizer.Cut([]rune(text)), cutset)
			classifier.Learn(words, item.class)
			totalNum++
		}

		// Measure once so the printed duration and the throughput figures are
		// derived from the same interval.
		elapsed := time.Since(startTime)

		// Guard against a zero interval: dividing by it would produce +Inf or
		// NaN, and converting those to uint64 is implementation-dependent.
		var mailsPerSec, bytesPerSec float64
		if seconds := elapsed.Seconds(); seconds > 0 {
			mailsPerSec = float64(totalNum) / seconds
			bytesPerSec = float64(totalSize) / seconds
		}

		fmt.Printf("TotalNum: %d\n", totalNum)
		fmt.Printf("Elapsed: %s, TPS(Mail): %f, TPS(FileSize): %s\n",
			elapsed,
			mailsPerSec,
			fileutil.Humanize(uint64(bytesPerSec)))
	}

	// A failed save must not pass silently: the whole point of the run is the
	// persisted classifier.
	if err := classifier.WriteToFile(output); err != nil {
		log.Fatal(err)
	}
}