Skip to content

Commit

Permalink
HELIO-3870 Index TOCs for improved search
Browse files Browse the repository at this point in the history
  • Loading branch information
sethaj committed Apr 27, 2021
1 parent 97a6008 commit ca77c64
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 7 deletions.
12 changes: 12 additions & 0 deletions app/indexers/monograph_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,21 @@ def generate_solr_document
# HELIO-3709 used in the NOID API
# Collapse whitespace in identifiers if they exist, although in practice HELIO-3712 fixes this
solr_doc[Solrizer.solr_name('identifier', :symbol)] = object.identifier.map { |id| id.gsub(/\s/, "") }

# Index the ToC of the monograph's epub or pdf_ebook if it has one, HELIO-3870
solr_doc[Solrizer.solr_name('table_of_contents', :stored_searchable)] = table_of_contents(object.id)
end
end

# Collects the entry titles from the cached table of contents of a work's ebook
# so they can be indexed for search (HELIO-3870).
#
# @param work_id [String] id of the work whose featured representatives are searched
# @return [Array<String>] ToC entry titles in cached order; empty when the work has
#   no epub/pdf_ebook representative or that representative has no cached ToC
# @raise [JSON::ParserError] if the cached ToC is not valid JSON
def table_of_contents(work_id)
  # 'epub' sorts before 'pdf_ebook', so ordering by kind prefers the epub when both exist
  ebook_id = FeaturedRepresentative.where(work_id: work_id, kind: ['epub', 'pdf_ebook']).order(:kind).first&.file_set_id
  return [] if ebook_id.nil?

  toc = EbookTableOfContentsCache.where(noid: ebook_id).first&.toc
  return [] if toc.nil?

  # Entries without a "title" key would otherwise contribute nil values to the Solr field
  JSON.parse(toc).map { |entry| entry["title"] }.compact
end

def importable_names(field)
value = object.public_send(field).first
value.present? ? value.split(/\r?\n/).reject(&:blank?).join('; ') : value
Expand Down
2 changes: 2 additions & 0 deletions app/jobs/unpack_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def perform(id, kind) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/P
unpack_epub(id, root_path, file)
create_search_index(root_path)
cache_epub_toc(id, root_path)
file_set.parent.update_index # index the ToC to the monograph
create_epub_chapters(id, root_path) # has to come after cache_epub_toc()
epub_webgl_bridge(id, root_path, kind)
when 'webgl'
Expand All @@ -52,6 +53,7 @@ def perform(id, kind) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/P
pdf = linearize_pdf(root_path, file)
create_pdf_chapters(id, pdf, root_path) if File.exist? pdf
cache_pdf_toc(id, pdf) if File.exist? pdf
file_set.parent.update_index # index the ToC to the monograph
else
Rails.logger.error("Can't unpack #{kind} for #{id}")
end
Expand Down
4 changes: 2 additions & 2 deletions spec/factories/featured_representatives.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

# Factory for featured representative records used by the specs.
# NOTE(review): this listing is a rendered diff, so work_id and file_set_id each
# appear twice below. The "MyString" pair looks like the pre-change lines and the
# Noid-minted pair their replacement; confirm only one pair exists in the real file.
FactoryBot.define do
factory :featured_representative do
work_id { "MyString" }
file_set_id { "MyString" }
# Mint a fresh noid per attribute so each record gets its own work and file set ids.
work_id { Noid::Rails::Service.new.mint }
file_set_id { Noid::Rails::Service.new.mint }
# Defaults to the epub kind; specs override this with kind: 'pdf_ebook' or 'webgl'.
kind { "epub" }
end
end
82 changes: 81 additions & 1 deletion spec/indexers/monograph_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
identifier: ['bar_number:S0001', 'heb9999.0001.001'],
press: press.subdomain)
}
let(:file_set) { create(:file_set) }
let(:file_set) { create(:file_set, content: File.open(File.join(fixture_path, 'moby-dick.epub'))) }
let(:press_name) { press.name }
let(:parent_press) { nil }

# Attach the file set to the monograph and persist both before each example.
# NOTE(review): the trailing file_set.save! presumably re-saves the file set after
# it has been attached so that its parent can be resolved later — confirm.
before do
monograph.ordered_members << file_set
monograph.save!
file_set.save!
end

it 'indexes the ordered members' do
Expand Down Expand Up @@ -67,6 +68,17 @@
end
end

# End-to-end check that an unpacked epub's ToC titles show up in the
# table_of_contents_tesim field of the indexed document (HELIO-3870).
context "ebook representative table of contents" do
# Register the file set as the monograph's epub representative first, then run the
# unpack job synchronously.
# NOTE(review): file_set and subject come from outside this block; file_set is
# presumably the moby-dick.epub fixture — confirm.
before do
create(:featured_representative, work_id: monograph.id, file_set_id: file_set.id, kind: "epub")
UnpackJob.perform_now(file_set.id, "epub") # to index the epub's table of contents HELIO-3870
end

# One known chapter title from the fixture stands in for the whole ToC.
it "indexes the epub/pdf_ebook's ToC if there is one" do
expect(subject['table_of_contents_tesim']).to include("Chapter 73. Stubb and Flask Kill a Right Whale; and Then Have a Talk")
end
end

# The representative_id_ssim value must equal the monograph's representative_id.
# NOTE(review): subject is defined outside this excerpt; presumably the generated
# Solr document — confirm.
it 'indexes the representative_id' do
expect(subject['representative_id_ssim']).to eq monograph.representative_id
end
Expand Down Expand Up @@ -205,4 +217,72 @@
end
end
end

# Examples for the indexer's #table_of_contents lookup, driven purely by
# FeaturedRepresentative and EbookTableOfContentsCache rows.
describe "#table_of_contents" do
  # The method is always handed the work id explicitly, so an indexer built
  # around an unsaved Monograph is used throughout.
  let(:toc_indexer) { described_class.new(Monograph.new) }

  context "no epub or pdf_ebook" do
    it "returns an empty list" do
      titles = toc_indexer.table_of_contents('somenoid')
      expect(titles).to eq []
    end
  end

  context "an epub with no toc" do
    let(:representative) { create(:featured_representative, kind: 'epub') }

    it "returns an empty list" do
      titles = toc_indexer.table_of_contents(representative.work_id)
      expect(titles).to eq []
    end
  end

  context "a pdf_ebook (no available epub) with no toc" do
    let(:representative) { create(:featured_representative, kind: 'pdf_ebook') }

    it "returns an empty list" do
      titles = toc_indexer.table_of_contents(representative.work_id)
      expect(titles).to eq []
    end
  end

  context "an epub with a toc" do
    let(:representative) { create(:featured_representative, kind: 'epub') }
    let(:toc_entries) do
      [
        { title: "Front Cover", level: 1, cfi: "/OEBPS/Cover.xhtml" },
        { title: "Chapter 1: The Starting", level: 1, cfi: "/OEBPS/1.xhtml" },
        { title: "Chapter 2: The Ending", level: 1, cfi: "/OEBPS/2.xhtml" }
      ]
    end

    # Cache the ToC as JSON under the representative's file set id.
    before do
      EbookTableOfContentsCache.create(noid: representative.file_set_id, toc: toc_entries.to_json)
    end

    it "returns the table of contents titles" do
      titles = toc_indexer.table_of_contents(representative.work_id)
      expect(titles).to eq ["Front Cover", "Chapter 1: The Starting", "Chapter 2: The Ending"]
    end
  end

  context "an epub and a pdf_ebook with tocs" do
    let(:monograph_noid) { "aa1234kl0" }
    let(:epub_toc) do
      [
        { title: "epub toc", level: 1, cfi: "/OEBPS/Cover.xhtml" }
      ]
    end
    let(:pdf_toc) do
      [
        { title: "pdf toc", level: 1, cfi: "page=1" }
      ]
    end
    let(:pdf) { create(:featured_representative, work_id: monograph_noid, kind: 'pdf_ebook') }
    let(:epub) { create(:featured_representative, work_id: monograph_noid, kind: 'epub') }

    # The pdf_ebook representative is referenced (and therefore created) before
    # the epub one, so the expectation cannot pass through creation order alone.
    before do
      EbookTableOfContentsCache.create(noid: pdf.file_set_id, toc: pdf_toc.to_json)
      EbookTableOfContentsCache.create(noid: epub.file_set_id, toc: epub_toc.to_json)
    end

    it "returns the epub table of contents titles" do
      titles = toc_indexer.table_of_contents(monograph_noid)
      expect(titles).to eq ["epub toc"]
    end
  end
end
end
4 changes: 4 additions & 0 deletions spec/jobs/reindex_epub_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@

RSpec.describe ReindexEpubJob, type: :job do
describe "perform" do
let(:monograph) { create(:monograph) }
let(:epub) { create(:file_set, content: File.open(File.join(fixture_path, 'fake_epub01.epub'))) }
let(:db_file) { File.join(UnpackService.root_path_from_noid(epub.id, 'epub'), epub.id + '.db') }

# Give the epub a parent monograph, persist both, then unpack the epub so the
# examples start from an already-unpacked state.
# NOTE(review): the parent is presumably required because UnpackJob calls
# file_set.parent.update_index — confirm.
before do
monograph.ordered_members << epub
monograph.save!
epub.save!
UnpackJob.perform_now(epub.id, 'epub')
end

Expand Down
40 changes: 36 additions & 4 deletions spec/jobs/unpack_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,17 @@
end

context "with a reflowable epub" do
let(:monograph) { create(:monograph) }
let(:reflowable_epub) { create(:file_set, content: File.open(File.join(fixture_path, 'fake_epub01.epub'))) }
let(:root_path) { UnpackService.root_path_from_noid(reflowable_epub.id, 'epub') }
let(:chapters_dir) { UnpackService.root_path_from_noid(reflowable_epub.id, 'epub_chapters') }

# Attach the reflowable epub to a monograph and persist both before each example.
# NOTE(review): presumably needed because the job under test calls
# file_set.parent.update_index, which requires a parent — confirm.
before do
monograph.ordered_members << reflowable_epub
monograph.save!
reflowable_epub.save!
end

it "unzips the epub, caches the ToC, creates the search database and doesn't make chapter files derivatives" do
described_class.perform_now(reflowable_epub.id, 'epub')
expect(JSON.parse(EbookTableOfContentsCache.find_by(noid: reflowable_epub.id).toc).length).to eq 3
Expand All @@ -32,10 +39,17 @@
end

context "with a fixed-layout epub" do
let(:monograph) { create(:monograph) }
let(:fixed_layout_epub) { create(:file_set, content: File.open(File.join(fixture_path, 'the-whale.epub'))) }
let(:root_path) { UnpackService.root_path_from_noid(fixed_layout_epub.id, 'epub') }
let(:chapters_dir) { UnpackService.root_path_from_noid(fixed_layout_epub.id, 'epub_chapters') }

# Attach the fixed-layout epub to a monograph and persist both before each example.
# NOTE(review): presumably needed because the job under test calls
# file_set.parent.update_index, which requires a parent — confirm.
before do
monograph.ordered_members << fixed_layout_epub
monograph.save!
fixed_layout_epub.save!
end

it "unzips the epub, caches the ToC, creates the search database and makes the chapter files derivatives" do
described_class.perform_now(fixed_layout_epub.id, 'epub')
expect(File.exist?(File.join(root_path, fixed_layout_epub.id + '.db'))).to be true
Expand Down Expand Up @@ -63,10 +77,17 @@
end

context "with a pdf_ebook" do
let(:monograph) { create(:monograph) }
let(:pdf_ebook) { create(:file_set, content: File.open(File.join(fixture_path, 'lorum_ipsum_toc.pdf'))) }
let(:root_path) { UnpackService.root_path_from_noid(pdf_ebook.id, 'pdf_ebook') }
let(:chapters_dir) { UnpackService.root_path_from_noid(pdf_ebook.id, 'pdf_ebook_chapters') }

# Attach the pdf_ebook to a monograph and persist both before each example.
# NOTE(review): presumably needed because the job under test calls
# file_set.parent.update_index, which requires a parent — confirm.
before do
monograph.ordered_members << pdf_ebook
monograph.save!
pdf_ebook.save!
end

it "makes the pdf_ebook, caches the ToC and makes the chapter files derivatives" do
described_class.perform_now(pdf_ebook.id, 'pdf_ebook')
expect(File.exist?("#{root_path}.pdf")).to be true
Expand Down Expand Up @@ -98,11 +119,18 @@
end

context "with an epub and pre-existing webgl" do
let(:monograph) { create(:monograph) }
let(:epub) { create(:file_set, content: File.open(File.join(fixture_path, 'fake_epub01.epub'))) }
let!(:fre) { create(:featured_representative, work_id: 'mono_id', file_set_id: epub.id, kind: 'epub') }
let!(:frw) { create(:featured_representative, work_id: 'mono_id', file_set_id: '123456789', kind: 'webgl') }
let!(:fre) { create(:featured_representative, work_id: monograph.id, file_set_id: epub.id, kind: 'epub') }
let!(:frw) { create(:featured_representative, work_id: monograph.id, file_set_id: '123456789', kind: 'webgl') }
let(:root_path) { UnpackService.root_path_from_noid(epub.id, 'epub') }

# Attach the epub to the monograph and persist both before each example.
# NOTE(review): presumably needed because the job under test calls
# file_set.parent.update_index, which requires a parent — confirm.
before do
monograph.ordered_members << epub
monograph.save!
epub.save!
end

after { FeaturedRepresentative.destroy_all }

it "creates the epub-webgl map" do
Expand All @@ -112,9 +140,13 @@
end

context "with a pre-existing epub" do
let(:monograph) { create(:monograph) }
let(:epub) { create(:file_set, content: File.open(File.join(fixture_path, 'fake_epub01.epub'))) }

# Attach the epub to the monograph, persist both, then run the job once so the
# nested examples start with the epub already unpacked.
before do
monograph.ordered_members << epub
monograph.save!
epub.save!
# we need the epub already unpacked in order to store the epub-webgl map file
described_class.perform_now(epub.id, 'epub')
end
Expand All @@ -123,8 +155,8 @@

context "adding a webgl" do
let(:webgl) { create(:file_set, content: File.open(File.join(fixture_path, 'fake-game.zip'))) }
let!(:fre) { create(:featured_representative, work_id: 'mono_id', file_set_id: epub.id, kind: 'epub') }
let!(:frw) { create(:featured_representative, work_id: 'mono_id', file_set_id: webgl.id, kind: 'webgl') }
let!(:fre) { create(:featured_representative, work_id: monograph.id, file_set_id: epub.id, kind: 'epub') }
let!(:frw) { create(:featured_representative, work_id: monograph.id, file_set_id: webgl.id, kind: 'webgl') }
# The root_path of the epub, not the webgl is used to test
let(:root_path) { UnpackService.root_path_from_noid(epub.id, 'epub') }

Expand Down

0 comments on commit ca77c64

Please sign in to comment.