#!/usr/bin/env ruby
# Streaming mapper: reads tab-separated (doc_id, paragraph_id, text) records
# from STDIN, runs Stanford CoreNLP over each paragraph, and writes
# tab-separated records to STDOUT:
#   sentence <tab> doc_id <tab> sentence_id <tab> original_text
#   token    <tab> doc_id <tab> sentence_id <tab> token_id <tab> base_form
#            <tab> part_of_speech <tab> index <tab> count <tab> token_begin
#            <tab> token_length
require 'rubygems'
require 'stanford-core-nlp'
require 'uuidtools'
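
# The stanford-core-nlp gem resolves the CoreNLP jars via jar_path and the
# model files via model_path; here both point at a local ./stanford-core-nlp/
# directory, which is expected to contain the jars and the English models.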
jar_path = File.expand_path('./stanford-core-nlp/') + '/'
StanfordCoreNLP.jvm_args = ['-Xmx3g']
StanfordCoreNLP.jar_path = jar_path
StanfordCoreNLP.model_path = jar_path
StanfordCoreNLP.use(:english)
class TextTokenizer
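  # Load the annotation pipeline once per process, as a class variable shared
  # by all instances, since JVM startup and model loading are expensive.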
  @@pipeline = StanfordCoreNLP.load(:tokenize, :ssplit, :pos, :lemma, :parse)

  def tokenize(paragraph)
    text = StanfordCoreNLP::Text.new(paragraph)
    @@pipeline.annotate(text)
    sentences = []
    text.get(:sentences).each do |sentence|
      sentence_begin = sentence.get(:character_offset_begin).to_s.to_i
      sentence_end   = sentence.get(:character_offset_end).to_s.to_i
      sentence_text  = paragraph[sentence_begin...sentence_end]
      sentence_data = {
        :sentence_id   => UUIDTools::UUID.timestamp_create,
        :original_text => sentence_text,
        :tokens        => []
      }
      index = 0
      words = Hash.new(0) # per-sentence occurrence count of each base form
      sentence.get(:tokens).each do |token|
        base_form = token.get(:lemma).to_s
        # Make token offsets relative to the enclosing sentence.
        token_begin = token.get(:character_offset_begin).to_s.to_i - sentence_begin
        token_end   = token.get(:character_offset_end).to_s.to_i - sentence_begin
        # Fold capitalized words to lower case *before* counting, so that
        # "The" and "the" share one hash entry and :count below is accurate.
        base_form.downcase! if /^[A-Z][a-z]+$/.match(base_form)
        words[base_form] += 1
        # Keep only plain alphabetic tokens between 2 and 24 characters long.
        if /^[a-z]{2,24}$/i.match(base_form)
          token_data = {
            :token_id       => UUIDTools::UUID.timestamp_create,
            :original_text  => token.get(:original_text).to_s,
            :base_form      => base_form,
            :part_of_speech => token.get(:part_of_speech).to_s,
            :index          => index,
            :count          => words[base_form],
            :token_begin    => token_begin,
            :token_length   => token_end - token_begin
          }
          sentence_data[:tokens] << token_data
          index += 1
        end
      end
      sentences << sentence_data
    end
    sentences
  end
end
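
# Hypothetical example: for an input line
#   d1<TAB>p1<TAB>Hello world. Goodbye.
# the mapper emits records along these lines (uuids and POS tags abbreviated):
#   sentence  d1  <uuid>  Hello world.
#   token     d1  <uuid>  <uuid>  hello  <pos>  0  1  0  5
#   token     d1  <uuid>  <uuid>  world  <pos>  1  1  6  5
#   ...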
if $0 == __FILE__
  tokenizer = TextTokenizer.new
  STDIN.each do |line|
    doc_id, paragraph_id, text = line.chomp.split("\t", 3)
    tokenizer.tokenize(text).each do |sentence|
      puts ["sentence", doc_id, sentence[:sentence_id], sentence[:original_text]].join("\t")
      sentence[:tokens].each do |token|
        # The token hash carries no :sentence_id, so take it from the
        # enclosing sentence rather than emitting an empty field.
        puts (["token", doc_id, sentence[:sentence_id]] +
              [:token_id, :base_form, :part_of_speech, :index, :count,
               :token_begin, :token_length].map { |key| token[key] }).join("\t")
      end
    end
  end
end
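
# A sketch of how this mapper might be launched, assuming Hadoop Streaming
# (which the mapper-style tab-separated STDIN/STDOUT contract suggests);
# input/output paths are placeholders, and the CoreNLP jars and required gems
# must also be available on each task node:
#
#   hadoop jar hadoop-streaming.jar \
#     -input  /corpus/paragraphs \
#     -output /corpus/tokens \
#     -mapper tokenize.rb \
#     -file   tokenize.rb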