/
ttr.go
132 lines (105 loc) · 2.41 KB
/
ttr.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Text-to-tag ratio (TTR) content extraction. Reference paper:
// http://www3.nd.edu/~tweninge/pubs/WH_TIR08.pdf
package libs
import (
"math"
"regexp"
"strings"
)
// TTR carries the state of one text-to-tag-ratio extraction over a single
// page: the fetched markup, the per-line ratios and the selected text.
type TTR struct {
	// Url is the address of the page; preprocess fetches it.
	Url string
	// title is not assigned by any method in this file; Title derives its
	// result from cleaned instead.
	title string
	// text accumulates the lines selected by countTextToTagRatio.
	text string
	// RawContent is the page exactly as returned by the fetch.
	RawContent string
	// cleaned is RawContent after CleanUpHtml has been applied.
	cleaned string

	// charset is the value GetHtmlCharset reports for cleaned; Text and Title
	// pass it to ConvertToUtf8.
	charset string
	// ratio maps a zero-based line number of cleaned to that line's ratio.
	ratio map[int]float64
	// sd is the standard deviation of the ratios, used as the selection
	// threshold.
	sd float64
}
// RunAlg performs the whole extraction for t.Url in two steps: preprocess
// fetches and cleans the page, then countTextToTagRatio scores every line and
// selects the text. It panics if the fetch in preprocess fails.
func (t *TTR) RunAlg() {
	// Step 1: download the page and strip it down to the cleaned markup.
	t.preprocess()

	// Step 2: compute per-line ratios and collect the lines that pass.
	t.countTextToTagRatio()
}
// preprocess downloads the page at t.Url and fills in RawContent, cleaned and
// charset. A failed download is not returned to the caller: it panics with the
// error from HTTP.Get.
func (t *TTR) preprocess() {
	raw, err := HTTP.Get(t.Url)
	if err != nil {
		panic(err)
	}

	// Keep the untouched response alongside the cleaned copy.
	t.RawContent = raw

	// CleanUpHtml is defined elsewhere; per the original note it strips
	// style, script and input tags (not verifiable from this file).
	t.cleaned = CleanUpHtml(raw)

	// The charset is detected on the cleaned markup, not on the raw page.
	t.charset = GetHtmlCharset(t.cleaned)
}
// ttrTagPattern matches a single HTML tag: a '<', one or more characters, and
// the nearest following '>'. It is compiled once rather than once per line.
var ttrTagPattern = regexp.MustCompile(`\<[\S\s]+?\>`)

// countTextToTagRatio scores every line of t.cleaned by its text-to-tag ratio,
// smooths the scores over a window of neighbouring lines, and collects into
// t.text the lines whose smoothed score is at least the standard deviation of
// all smoothed scores.
//
// Results written on t:
//   - ratio: zero-based line number -> smoothed ratio
//   - sd:    sample standard deviation of the smoothed ratios (0 when there
//     are fewer than two lines)
//   - text:  the selected lines concatenated in document order, without a
//     separator; any text from a previous run is replaced
//
// All work is done on slices in line order, so the outcome does not depend on
// map iteration order.
func (t *TTR) countTextToTagRatio() {
	// Number of neighbouring lines on each side that take part in smoothing.
	const radius = 2

	lines := strings.Split(t.cleaned, "\n")
	ratios := ttrSmooth(ttrLineRatios(lines), radius)

	t.ratio = make(map[int]float64, len(ratios))
	for i, r := range ratios {
		t.ratio[i] = r
	}
	t.sd = ttrStdDev(ratios)

	var selected strings.Builder
	for i, r := range ratios {
		if r >= t.sd {
			selected.WriteString(lines[i])
		}
	}
	t.text = selected.String()
}

// ttrLineRatios returns, for each line, the number of bytes outside tags
// divided by the number of tags. A line without tags scores its full length.
func ttrLineRatios(lines []string) []float64 {
	ratios := make([]float64, len(lines))
	for i, line := range lines {
		tags := ttrTagPattern.FindAllString(line, -1)
		if len(tags) == 0 {
			ratios[i] = float64(len(line))
			continue
		}
		tagBytes := 0
		for _, tag := range tags {
			tagBytes += len(tag)
		}
		ratios[i] = float64(len(line)-tagBytes) / float64(len(tags))
	}
	return ratios
}

// ttrSmooth returns a copy of ratios in which every element that has radius
// neighbours on both sides is replaced by the mean of the 2*radius+1 original
// values centred on it. Elements closer to either end are left unchanged.
func ttrSmooth(ratios []float64, radius int) []float64 {
	smoothed := make([]float64, len(ratios))
	copy(smoothed, ratios)

	window := float64(2*radius + 1)
	for i := radius; i+radius < len(ratios); i++ {
		var sum float64
		// Read from the unsmoothed input so earlier results do not feed
		// into later windows.
		for j := i - radius; j <= i+radius; j++ {
			sum += ratios[j]
		}
		smoothed[i] = sum / window
	}
	return smoothed
}

// ttrStdDev returns the sample standard deviation (n-1 denominator) of values,
// or 0 when there are fewer than two values and the deviation is undefined.
func ttrStdDev(values []float64) float64 {
	n := len(values)
	if n < 2 {
		return 0
	}

	var total float64
	for _, v := range values {
		total += v
	}
	mean := total / float64(n)

	var squares float64
	for _, v := range values {
		d := v - mean
		squares += d * d
	}
	return math.Sqrt(squares / float64(n-1))
}
// Text returns the text selected by countTextToTagRatio, passed through
// ConvertToUtf8 together with the charset recorded by preprocess.
func (t *TTR) Text() string {
	body, enc := t.text, t.charset
	return ConvertToUtf8(body, enc)
}
// Title returns the title that RetrieveTitleFromHtml finds in the cleaned
// markup, passed through ConvertToUtf8 with the recorded charset. The title is
// looked up again on every call; the struct's title field is not used.
func (t *TTR) Title() string {
	return ConvertToUtf8(RetrieveTitleFromHtml(t.cleaned), t.charset)
}
// NewTtr returns a TTR for url as an OupengAlg. Nothing is fetched here; only
// the Url field is set.
func NewTtr(url string) OupengAlg {
	alg := new(TTR)
	alg.Url = url
	return alg
}
// init registers the NewTtr constructor under the name "ttr" via RegisterAlg
// when the package is loaded.
func init() {
	RegisterAlg("ttr", NewTtr)
}