/
word_counter.rb
37 lines (30 loc) · 1.19 KB
/
word_counter.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# encoding=utf-8
require 'nokogiri'
# Counts the words in a piece of text that may contain HTML markup.
#
# Only text nodes found under the parsed document's <body> contribute to the
# total, so tags and attributes are never counted.
class WordCounter
  # Hyphens and apostrophes are removed before scanning, so "one-word" and
  # "one's" are each counted as one word, not two.
  INTRA_WORD_PUNCTUATION = /['’‘-]/.freeze

  attr_accessor :text

  # @param text [String, nil] the plain or HTML text to count words in
  def initialize(text)
    @text = text
  end

  # Returns the number of words in the text.
  #
  # Words are found by scanning for runs of word characters after stripping
  # hyphens and apostrophes. "--" is replaced by an em dash before the strip,
  # so "one--two" counts as 2.
  #
  # Scripts listed in ArchiveConfig.CHARACTER_COUNT_SCRIPTS (scripts such as
  # Chinese and Japanese that do not put spaces between words) are counted
  # per character. In mixed-language text only characters of those scripts
  # are counted individually; words in other languages are counted as usual.
  #
  # @return [Integer] the word count; 0 for nil or empty text, or when the
  #   parsed document has no <body>
  def count
    # avoid blank? so we don't need to load Rails for tests
    return 0 if @text.nil? || @text.empty?

    body = Nokogiri::HTML(@text).xpath('//body').first
    # The parser may produce a document without a <body> (for example when it
    # treats the input as empty); there is nothing to count in that case.
    return 0 if body.nil?

    # Build the pattern once instead of once per text node.
    pattern = word_pattern
    total = 0
    body.traverse do |node|
      next unless node.is_a?(Nokogiri::XML::Text)

      total += countable_text(node.inner_text).scan(pattern).size
    end
    total
  end

  private

  # Normalises a text node's content so that scanning yields one match per word.
  def countable_text(raw)
    raw.gsub(/--/, "—").gsub(INTRA_WORD_PUNCTUATION, "")
  end

  # Builds the regexp matching either a single character of a per-character
  # script or a run of word characters that are not in any such script.
  def word_pattern
    scripts = ArchiveConfig.CHARACTER_COUNT_SCRIPTS.map { |lang| "\\p{#{lang}}" }.join("|")
    # With no per-character scripts configured, an empty alternative would
    # match at every position, so fall back to plain word runs.
    return /[[:word:]]+/ if scripts.empty?

    /#{scripts}|(?:(?!#{scripts})[[:word:]])+/
  end
end