Skip to content

Commit

Permalink
Merge pull request #2422 from mlibrary/2809_pdf_toc
Browse files Browse the repository at this point in the history
Addresses HELIO-2809 Generate TOC from representative PDF's outline
  • Loading branch information
gkostin1966 committed Jul 16, 2019
2 parents a0897f5 + b85dc2f commit b64e3ec
Show file tree
Hide file tree
Showing 14 changed files with 379 additions and 2 deletions.
3 changes: 3 additions & 0 deletions Gemfile
Expand Up @@ -129,6 +129,9 @@ gem "loofah", ">= 2.2.3"
# Use MySQL as the database for Active Record
gem 'mysql2', '~> 0.4.10'

# Read PDF ToC
gem 'origami'

# Force epub search results to be sentences
gem 'pragmatic_segmenter', '~> 0.3'

Expand Down
4 changes: 4 additions & 0 deletions Gemfile.lock
Expand Up @@ -186,6 +186,7 @@ GEM
execjs
coffee-script-source (1.12.2)
colorator (1.1.0)
colorize (0.8.1)
combine_pdf (1.0.16)
ruby-rc4 (>= 0.1.5)
commonjs (0.2.7)
Expand Down Expand Up @@ -598,6 +599,8 @@ GEM
rack (>= 1.2, < 3)
openseadragon (0.5.0)
rails (> 3.2.0)
origami (2.1.0)
colorize (~> 0.7)
orm_adapter (0.5.0)
os (1.0.1)
parallel (1.12.1)
Expand Down Expand Up @@ -1004,6 +1007,7 @@ DEPENDENCIES
mysql2 (~> 0.4.10)
oauth
oauth2 (~> 1.2)
origami
pragmatic_segmenter (~> 0.3)
prawn (~> 2.2)
pry-rails (~> 0.3.4)
Expand Down
Expand Up @@ -97,10 +97,19 @@ def pdf_ebook?
featured_representatives.map(&:kind).include? 'pdf_ebook'
end

# Finds the member document that corresponds to the featured PDF e-book.
#
# @return [Object, nil] the first entry of +ordered_member_docs+ whose +id+
#   equals +pdf_ebook_id+, or nil when no entry matches
def pdf_ebook
  # Resolve the id once: it is the same for every document being compared,
  # so there is no reason to recompute it on each iteration of +find+.
  target_id = pdf_ebook_id
  ordered_member_docs.find { |doc| doc.id == target_id }
end

# Looks up the file set id of the featured representative of kind 'pdf_ebook'.
#
# @return [Object, nil] the first non-nil +file_set_id+ among the
#   representatives whose +kind+ is 'pdf_ebook'; nil when there is none
def pdf_ebook_id
  pdf_representatives = featured_representatives.select { |rep| rep.kind == 'pdf_ebook' }
  pdf_representatives.map(&:file_set_id).compact.first
end

# Builds, once, the presenter for the featured PDF e-book and memoizes it.
#
# The Sighrax entity lookup lives inside the memoized expression so that
# repeated calls do not repeat the lookup after the presenter has been built.
#
# @return [PDFEbookPresenter] presenter wrapping the result of
#   PDFEbook::Publication.from_string_id for the entity's content
def pdf_ebook_presenter
  @pdf_ebook_presenter ||= begin
    file_set_id = pdf_ebook_id
    entity = Sighrax.factory(file_set_id)
    PDFEbookPresenter.new(PDFEbook::Publication.from_string_id(entity.content, file_set_id))
  end
end

# Tells whether any featured representative is of kind 'mobi'.
#
# @return [Boolean]
def mobi?
  featured_representatives.any? { |representative| representative.kind == 'mobi' }
end
Expand Down
23 changes: 23 additions & 0 deletions app/presenters/pdf_ebook_presenter.rb
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# Presents a PDF e-book publication (an object responding to +id+ and
# +intervals+) to the table-of-contents views.
#
# The wrapped publication may be nil: PDFEbook::Publication.from_string_id and
# .from_path_id return nil when the PDF cannot be read. A nil publication is
# presented as one with no id and no intervals instead of raising
# NoMethodError.
class PDFEbookPresenter < ApplicationPresenter
  # @param pdf_ebook [#id, #intervals, nil] the publication to present
  def initialize(pdf_ebook)
    @pdf_ebook = pdf_ebook
  end

  # @return [Object, nil] the publication's id, or nil without a publication
  def id
    @pdf_ebook&.id
  end

  # @return [Boolean] always false
  def multi_rendition?
    false
  end

  # @return [Boolean] true when the publication has at least one interval
  def intervals?
    !publication_intervals.empty?
  end

  # @return [Array<EPubIntervalPresenter>] one presenter per interval
  def intervals
    publication_intervals.map { |interval| EPubIntervalPresenter.new(interval) }
  end

  private

  # Raw intervals of the wrapped publication; empty when there is none.
  def publication_intervals
    @pdf_ebook&.intervals || []
  end
end
2 changes: 1 addition & 1 deletion app/views/monograph_catalog/_index_epub_toc.html.erb
@@ -1,6 +1,6 @@
<%
level = 0
epub_presenter = @monograph_presenter.epub_presenter
epub_presenter = @monograph_presenter.epub? ? @monograph_presenter.epub_presenter : @monograph_presenter.pdf_ebook_presenter
epub_policy = @monograph_policy.epub_policy
%>
<% epub_presenter.intervals.each do |interval| %>
Expand Down
2 changes: 1 addition & 1 deletion app/views/monograph_catalog/_index_monograph.html.erb
Expand Up @@ -125,7 +125,7 @@
<li role="presentation"><a id="tab-stats" href="#stats" role="tab" data-toggle="tab" aria-controls="stats" aria-selected="false" tabindex="-1">Stats</a></li>
</ul>
<div id="tabs-content" class="tab-content monograph-assets-toc-epub-content" aria-live="polite">
<% if @monograph_presenter.epub? %>
<% if @monograph_presenter.epub? || @monograph_presenter.pdf_ebook? %>
<section id="toc" class="tab-pane active fade in toc row" role="tabpanel" aria-hidden="false" aria-labelledby="tab-toc" tabindex="0">
<div class="col-sm-12">
<h2 class="sr-only">Table of Contents</h2>
Expand Down
48 changes: 48 additions & 0 deletions lib/pdf_ebook.rb
@@ -0,0 +1,48 @@
# frozen_string_literal: true

# Namespace for PDF e-book support. Holds two pieces of module-level state:
# a logger (defaults to a Logger writing to STDOUT) and a "configured" flag.
module PDFEbook
  require 'logger'

  @logger = Logger.new(STDOUT)
  @configured = false

  class << self
    # Module-wide logger; readable and replaceable.
    attr_accessor :logger

    # Spec helper: puts the configured flag back to false.
    def reset_configured_flag
      @configured = false
    end

    # @return [Boolean] whether +configure+ has been called since the last reset
    def configured?
      @configured
    end

    # Marks the module as configured, then yields the module itself to the
    # given block. Returns the block's value.
    def configure
      @configured = true
      yield self
    end
  end
end

#
# Require Dependencies
#
require 'origami'

#
# Require Relative
#
require_relative './pdf_ebook/interval'
require_relative './pdf_ebook/publication'
59 changes: 59 additions & 0 deletions lib/pdf_ebook/interval.rb
@@ -0,0 +1,59 @@
# frozen_string_literal: true

module PDFEbook
  # One table-of-contents entry: a title, a nesting level and a "cfi" locator
  # string. Instances are created only through the class-level factories;
  # +new+ is private.
  class Interval
    private_class_method :new

    class << self
      # Builds an Interval when both +title+ and +cfi+ are exactly Strings;
      # otherwise hands back the null object.
      def from_title_level_cfi(title, level, cfi)
        both_strings = [title, cfi].all? { |value| value&.instance_of?(String) }
        both_strings ? new(title: title, depth: level, cfi: cfi) : null_object
      end

      # An interval with empty title/cfi and level 0.
      def null_object
        IntervalNullObject.send(:new)
      end
    end

    # @return [String] the stored title, '' when absent
    def title
      stored(:title, '')
    end

    # @return [Object] the stored depth, 0 when absent
    def level
      stored(:depth, 0)
    end

    # @return [String] the stored cfi, '' when absent
    def cfi
      stored(:cfi, '')
    end

    # @return [Boolean] always false
    def downloadable?
      false
    end

    # @return [Array] always empty
    def pages
      []
    end

    # @return [Array] always empty
    def downloadable_pages
      []
    end

    private

    def initialize(args)
      @args = args
    end

    # Value kept under +key+, or +fallback+ when it is missing or falsy.
    def stored(key, fallback)
      @args[key] || fallback
    end
  end

  # Interval carrying no data; every reader falls back to its default.
  class IntervalNullObject < Interval
    private_class_method :new

    private

    def initialize
      super({})
    end
  end
end
67 changes: 67 additions & 0 deletions lib/pdf_ebook/publication.rb
@@ -0,0 +1,67 @@
# frozen_string_literal: true

module PDFEbook
  # A PDF read through Origami, exposing its outline as a flat list of
  # PDFEbook::Interval objects. Instances are built only through
  # +from_string_id+ / +from_path_id+; +new+ is private.
  class Publication
    private_class_method :new
    attr_reader :id

    # Class Methods

    # Builds a publication from PDF content held in memory.
    #
    # @param string [String] the PDF content
    # @param id [Object] identifier exposed through +id+
    # @return [Publication, nil] nil when anything raises a StandardError
    #   (the error is logged at info level)
    def self.from_string_id(string, id)
      new(StringIO.new(string), id)
    rescue StandardError => e
      # +to_s+ keeps this handler from raising itself when +string+ is nil.
      ::PDFEbook.logger.info("Publication.from_string_id(#{string.to_s[0..30]}) raised #{e} #{e.backtrace}")
      nil
    end

    # Builds a publication from a PDF file on disk.
    #
    # @param path [String] path of the PDF file
    # @param id [Object] identifier exposed through +id+
    # @return [Publication, nil] nil when anything raises a StandardError
    #   (the error is logged at info level)
    def self.from_path_id(path, id)
      # Block form so the file handle is closed once the PDF has been read.
      # NOTE(review): assumes Origami::PDF.read (non-lazy, as used here) reads
      # the whole stream before returning - confirm against the Origami parser.
      File.open(path) { |file| new(file, id) }
    rescue StandardError => e
      ::PDFEbook.logger.info("Publication.from_path_id(#{path}) raised #{e} #{e.backtrace}")
      nil
    end

    # Public method

    # @return [Array<PDFEbook::Interval>] outline entries in document order,
    #   computed on first use and cached; empty when the PDF has no outline
    def intervals
      @intervals ||= extract_intervals
    end

    private

    def initialize(file, id)
      @pdf = Origami::PDF.read(file, verbosity: Origami::Parser::VERBOSE_QUIET)
      @id = id
      @obj_to_page = {}
    end

    def extract_intervals
      # Map of PDF page object number to 0-based linear page number
      if @obj_to_page.empty?
        @pdf.pages.each_with_index do |page, index|
          @obj_to_page[page.no] = index
        end
      end
      # The catalog may have no Outlines entry at all; safe navigation turns
      # that case into an empty interval list instead of a NoMethodError.
      iterate_outlines(@pdf.Catalog.Outlines&.[](:First)&.solve, 1)
    end

    # Takes Origami::OutlineItem and 1-based depth.
    # Walks the item's :Next chain, descending into :First children, and
    # returns the collected intervals (parents before their children).
    def iterate_outlines(outline, depth)
      intervals = []
      until outline.nil?
        # Destination page: first element of :D under :A, else of :Dest.
        page = outline[:A]&.solve&.[](:D)&.[](0)&.solve # Origami::Page
        page ||= outline[:Dest]&.solve&.[](0)&.solve
        unless page.nil?
          # Unknown page objects fall back to page 0.
          page_number = @obj_to_page[page.no] || 0
          intervals << PDFEbook::Interval.from_title_level_cfi(outline[:Title].to_utf8, depth, "page=#{page_number}")
        end
        child = outline[:First]&.solve # Child outline
        intervals += iterate_outlines(child, depth + 1) unless child.nil?
        outline = outline[:Next]&.solve
      end
      intervals
    end
  end
end
Binary file added lib/spec/fixtures/fake_pdf01.pdf
Binary file not shown.
38 changes: 38 additions & 0 deletions lib/spec/pdf_ebook/interval_spec.rb
@@ -0,0 +1,38 @@
# frozen_string_literal: true

# Specs for PDFEbook::Interval: the private constructor, the null object and
# the from_title_level_cfi factory.
RSpec.describe PDFEbook::Interval do
describe '#new' do
# The implicit subject instantiates the described class with +new+, which
# Interval declares private, hence NoMethodError.
it { expect { is_expected }.to raise_error(NoMethodError) }
end

describe '#null_object' do
subject { described_class.null_object }

# The null object reports empty/zero defaults for every reader.
it { is_expected.to be_an_instance_of(PDFEbook::IntervalNullObject) }
it { expect(subject.title).to be_empty }
it { expect(subject.level).to be_zero }
it { expect(subject.cfi).to be_empty }
it { expect(subject.downloadable?).to be false }
it { expect(subject.pages).to be_empty }
end

describe '#from_title_level_cfi' do
subject { described_class.from_title_level_cfi(title, level, cfi) }

let(:title) { double('title') }
let(:level) { double('level') }
let(:cfi) { double('cfi') }
# NOTE(review): no example in this spec references :interval.
let(:interval) { double('interval', cfi: cfi, title: title) }

# Plain doubles are not String instances, so the factory falls back to the
# null object.
it { is_expected.to be_an_instance_of(PDFEbook::IntervalNullObject) }

context 'Strings' do
before do
# Make the doubles answer instance_of?(String) with true so they pass the
# factory's String check.
allow(cfi).to receive(:instance_of?).with(String).and_return(true)
allow(title).to receive(:instance_of?).with(String).and_return(true)
end

it { is_expected.to be_an_instance_of(described_class) }
end
end
end
54 changes: 54 additions & 0 deletions lib/spec/pdf_ebook/publication_spec.rb
@@ -0,0 +1,54 @@
# frozen_string_literal: true

# Specs for PDFEbook::Publication: the private constructor and the intervals
# extracted from the fixture PDF's outline.
RSpec.describe PDFEbook::Publication do
  describe '#new' do
    # The implicit subject calls +new+, which Publication declares private.
    it { expect { is_expected }.to raise_error(NoMethodError) }
  end

  describe "with a test PDF" do
    context "using #from_path_id" do
      before do
        @noid = '99999999'
        @file = './spec/fixtures/fake_pdf01.pdf'
      end

      describe "#intervals" do
        subject { described_class.from_path_id(@file, @noid) }

        it { is_expected.to be_an_instance_of(described_class) }

        it "has 5 intervals" do
          expect(subject.intervals.count).to be 5
        end

        describe "interval 1" do
          subject { described_class.from_path_id(@file, @noid).intervals[0] }

          it "has title Front Cover" do
            expect(subject.title).to eq "Front Cover"
          end
          it "has level 1" do
            expect(subject.level).to eq 1
          end
          it "has the cfi of" do
            expect(subject.cfi).to eq 'page=0'
          end
        end

        describe "interval 4" do
          subject { described_class.from_path_id(@file, @noid).intervals[3] }

          # Description now matches the asserted title (it previously read
          # "Front Cover", copied from the interval 1 example).
          it "has title Section 2.1" do
            expect(subject.title).to eq "Section 2.1"
          end
          it "has level 2" do
            expect(subject.level).to eq 2
          end
          it "has the cfi of" do
            expect(subject.cfi).to eq 'page=5'
          end
        end
      end
    end
  end
end
30 changes: 30 additions & 0 deletions lib/spec/pdf_ebook_spec.rb
@@ -0,0 +1,30 @@
# frozen_string_literal: true

# Specs for the PDFEbook module-level logger accessor and configure hook.
RSpec.describe PDFEbook do
  describe '#logger' do
    it 'attribute getter' do
      expect { described_class.logger }.not_to raise_error
    end

    it 'attribute setter' do
      # The logger is module-level state shared by every spec in the run, so
      # restore it afterwards; otherwise it would stay nil for later examples.
      original_logger = described_class.logger
      begin
        expect { described_class.logger = nil }.not_to raise_error
      ensure
        described_class.logger = original_logger
      end
    end
  end

  describe '#configure' do
    before { described_class.reset_configured_flag }

    it 'setup block yields subject' do
      setup_config = nil
      described_class.configure do |config|
        setup_config = config
      end
      is_expected.to eq setup_config
    end

    it 'subject is configured' do
      described_class.configure { |_config| }
      expect(subject.configured?).to be true
    end
  end
end

0 comments on commit b64e3ec

Please sign in to comment.