/
loader.rb
133 lines (126 loc) · 3.8 KB
/
loader.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# class Aozorasearch::Loader
#
# Copyright (C) 2016 Masafumi Yokoyama <myokoym@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
require "csv"
require "nkf"
require "nokogiri"
require "parallel"
require "zip"
require "aozorasearch/groonga_database"
require "aozorasearch/book"
module Aozorasearch
  # Registers Aozora Bunko books and their authors in the Groonga "Books" and
  # "Authors" tables. All input is read from a local "aozorabunko" directory,
  # resolved relative to the current working directory.
  class Loader
    # Zip archive that contains the index CSV describing every book.
    INDEX_ZIP_PATH = "aozorabunko/index_pages/list_person_all_extended_utf8.zip"

    # Directory the "/cards/..." part of a book's HTML URL is resolved against.
    HTML_ROOT = "aozorabunko"

    # Age group stored when no birth year can be derived.
    UNKNOWN_AGE_GROUP = "????"

    # Marker that identifies a B.C. birthdate; also stored as its age group.
    BC_AGE_GROUP = "紀元前"

    # Builds a Book for every row of the index CSV and loads each of them.
    #
    # options - Hash of settings, handed unchanged to #load_book:
    #           :parallel - when truthy, iterate with Parallel.each instead
    #                       of Array#each.
    #           :diff     - see #load_book.
    #
    # Returns whatever the chosen iterator returns.
    def load(options={})
      books = read_books
      if options[:parallel]
        Parallel.each(books) { |book| load_book(book, options) }
      else
        books.each { |book| load_book(book, options) }
      end
    end

    private

    # Reads the first "*.csv" entry of the index zip and wraps every row in a
    # Book. The CSV is parsed with a header row and without value converters.
    #
    # Raises RuntimeError when the archive has no CSV entry.
    def read_books
      books = []
      Zip::File.open(INDEX_ZIP_PATH) do |zip_file|
        entry = zip_file.glob("*.csv").first
        raise "no CSV entry found in #{INDEX_ZIP_PATH}" unless entry

        # The block form closes the entry's input stream once it has been read.
        csv_text = entry.get_input_stream { |stream| stream.read }
        # The bytes come back without a text encoding; the archive name
        # advertises UTF-8, so tag them accordingly before parsing.
        csv_text.force_encoding(Encoding::UTF_8)
        CSV.new(csv_text, headers: true, converters: nil).each do |row|
          books << Book.new(row)
        end
      end
      books
    end

    # Stores one book (title, extracted text, authors and metadata) in the
    # "Books" table, creating its "Authors" record first when needed.
    #
    # book    - the Book to store.
    # options - Hash of settings:
    #           :diff - when set, the book is skipped if the later of its
    #                   published/updated dates is older than this value.
    #
    # Returns nil when the book is skipped, otherwise the result of the
    # "Books" table's #add.
    def load_book(book, options={})
      if options[:diff]
        latest_date = [book.published_date, book.updated_date].max
        return if latest_date < options[:diff]
      end

      # The author is registered even when the book turns out to have no
      # usable HTML path below.
      author = find_or_add_author(book)

      path = book.html_url[%r{/cards/.*}]
      return unless path

      puts "#{book.name} - #{book.author_name}"
      html = File.read(File.join(HTML_ROOT, path))
      # Let NKF guess the file's encoding so Nokogiri decodes it correctly.
      doc = Nokogiri::HTML.parse(html, nil, NKF.guess(html).to_s)

      title = full_title(book)
      content = extract_content(doc)
      age_group = age_group_for(book.author_birthdate)

      # Keep the authors already attached to an existing record so that a book
      # listed once per author accumulates all of them.
      existing_book = Groonga["Books"][book.id]
      authors = existing_book ? existing_book.authors : []
      authors << author

      Groonga["Books"].add(
        book.id,
        title: title,
        content: content,
        authors: authors.uniq,
        card_url: book.card_url,
        html_url: book.html_url,
        orthography: book.orthography,
        copyrighted: book.copyrighted,
        ndc: book.ndc,
        ndc1: book.ndc1,
        ndc2: book.ndc2,
        ndc3: book.ndc3,
        age_group: age_group,
        kids: book.kids?,
      )
    end

    # Returns the "Authors" record keyed by the book's author id, adding it
    # (with the author's name) when it does not exist yet.
    def find_or_add_author(book)
      Groonga["Authors"][book.author_id] ||
        Groonga["Authors"].add(book.author_id, name: book.author_name)
    end

    # Returns the book's title, followed by a space and the subtitle when the
    # subtitle is not empty.
    def full_title(book)
      return book.title if book.subtitle.empty?

      "#{book.title} #{book.subtitle}"
    end

    # Concatenates the text of the document's main body.
    #
    # Only the direct children of "body .main_text" are inspected (or those of
    # "body" when that selection has no children): "text" and "div" nodes
    # contribute their text, "ruby" nodes contribute only the text of their
    # first "rb" descendant, and every other node is ignored.
    def extract_content(doc)
      nodes = doc.search("body .main_text").children
      nodes = doc.search("body").children if nodes.empty?

      # Append in place; "+=" would copy the whole text for every node.
      content = "".dup
      nodes.each do |node|
        case node.node_name
        when "text", "div"
          content << node.text
        when "ruby"
          base = node.at_xpath('.//rb')
          content << base.text if base
        end
      end
      content
    end

    # Derives the age group (birth decade) from an author's birthdate string.
    #
    # - empty birthdate, or one with nothing before its first "-": "????"
    # - birthdate containing "紀元前": "紀元前"
    # - otherwise the part before the first "-", zero-padded to four digits
    #   when it is a 1-3 digit number, with a trailing digit replaced by "0"
    #   (e.g. "1892-03-01" => "1890").
    def age_group_for(birthdate)
      return UNKNOWN_AGE_GROUP if birthdate.empty?
      # Checked first: a B.C. date also has a non-empty leading part, so it
      # would otherwise be treated as an ordinary year.
      return BC_AGE_GROUP if birthdate.include?(BC_AGE_GROUP)

      year = birthdate.split(/-/).first
      return UNKNOWN_AGE_GROUP unless year

      # to_i parses in base 10; handing the string to "%d" directly would
      # read a leading zero as an octal prefix.
      year = sprintf("%04d", year.to_i) if /\A\d{1,3}\z/ =~ year
      year.sub(/\d\z/, "0")
    end
  end
end