This repository has been archived by the owner on Jul 18, 2018. It is now read-only.
/
html_to_text.go
123 lines (99 loc) · 3.78 KB
/
html_to_text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/*
Copyright 2016 Medcl (m AT medcl.net)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package pipe
import (
log "github.com/cihub/seelog"
"github.com/medcl/gopa/core/model"
. "github.com/medcl/gopa/core/pipeline"
"github.com/medcl/gopa/core/util"
"regexp"
"strings"
)
// HtmlToText is the joint key of the HTML-to-text joint; HtmlToTextJoint.Name
// returns it converted to a plain string.
const HtmlToText JointKey = "html2text"
// HtmlToTextJoint is the joint whose Process method reduces a snapshot's HTML
// payload to text.
type HtmlToTextJoint struct {
	// MergeWhitespace, when true, makes Process additionally pass the
	// extracted text through util.MergeSpace (merging whitespace and "\n").
	MergeWhitespace bool
}
// Name reports the joint's name, which is the HtmlToText key as a plain string.
func (joint HtmlToTextJoint) Name() string {
	name := string(HtmlToText)
	return name
}
func (this HtmlToTextJoint) Process(context *Context) error {
//TODO all configable
snapshot := context.MustGet(CONTEXT_CRAWLER_SNAPSHOT).(*model.Snapshot)
body := snapshot.Payload
src := string(body)
//将HTML标签全转换成小写
re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
src = re.ReplaceAllStringFunc(src, strings.ToLower)
//去除STYLE
re, _ = regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")
src = re.ReplaceAllString(src, "")
//去除META
re, _ = regexp.Compile("\\<meta[\\S\\s]+?\\</meta\\>")
src = re.ReplaceAllString(src, "")
//去除注释
re, _ = regexp.Compile("<!--[\\S\\s]*?-->")
src = re.ReplaceAllString(src, "")
//去除SCRIPT,NOSCRIPT
re, _ = regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<noscript[\\S\\s]+?\\</noscript\\>")
src = re.ReplaceAllString(src, "")
//去除iframe,frame
re, _ = regexp.Compile("\\<iframe[\\S\\s]+?\\</iframe\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<frame[\\S\\s]+?\\</frame\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<frameset[\\S\\s]+?\\</frameset\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<noframes[\\S\\s]+?\\</noframes\\>")
src = re.ReplaceAllString(src, "")
//remove embed objects
re, _ = regexp.Compile("\\<noembed[\\S\\s]+?\\</noembed\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<embed[\\S\\s]+?\\</embed\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<applet[\\S\\s]+?\\</applet\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<object[\\S\\s]+?\\</object\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<base[\\S\\s]+?\\</base\\>")
src = re.ReplaceAllString(src, "")
//remove code blocks
re, _ = regexp.Compile("\\<pre[\\S\\s]+?\\</pre\\>")
src = re.ReplaceAllString(src, "")
re, _ = regexp.Compile("\\<code[\\S\\s]+?\\</code\\>")
src = re.ReplaceAllString(src, "")
//去除所有尖括号内的HTML代码,并换成换行符
re, _ = regexp.Compile("\\<[\\S\\s]+?\\>")
src = re.ReplaceAllString(src, "\n")
//去除连续的换行符
re, _ = regexp.Compile("\\s{2,}")
src = re.ReplaceAllString(src, "\n")
if this.MergeWhitespace {
src = util.MergeSpace(src)
}
src = strings.Replace(src, "‘", "'", -1)
src = strings.Replace(src, "’", "'", -1)
src = strings.Replace(src, "“", "\"", -1)
src = strings.Replace(src, "”", "\"", -1)
src = strings.Replace(src, " ", " ", -1)
src = strings.Replace(src, """, "\"", -1)
src = strings.Replace(src, "'", "'", -1)
src = strings.Replace(src, """, "\"", -1)
src = strings.Replace(src, "'", "'", -1)
src = strings.Replace(src, "& ", "& ", -1)
src = strings.Replace(src, "&amp; ", "& ", -1)
log.Trace("get text: ", src)
snapshot.Text = util.XSSHandle(src)
return nil
}