Skip to content

Commit

Permalink
Ingest CLI (rake task) and Logging (#150)
Browse files Browse the repository at this point in the history
* NDNP BatchIngester has alternate constructor intended for command-line
use.

* Rake task for NDNP ingest

* NDNP PageIngester generates TIFF from PDF when TIFF is missing.

* BatchIngester whitelists the batch directory, when called from CLI.

* Clean up NDNP title/publication creation, don't duplicate copy of
metadata.

* Logging mixin for ingest logging

* ALTO fixtures (unscaled px units, and 4X scaled points) generated by OCR of ocr_gray.tiff

* RenderALTO class supports scaling for use in generating test fixtures
  • Loading branch information
seanupton authored and ebenenglish committed May 30, 2019
1 parent 8eb580c commit fb6879a
Show file tree
Hide file tree
Showing 23 changed files with 1,017 additions and 57 deletions.
3 changes: 3 additions & 0 deletions .rubocop.yml
Expand Up @@ -3,6 +3,9 @@ inherit_gem:

inherit_from: .rubocop_fixme.yml

RSpec/ExampleLength:
Max: 15

AllCops:
TargetRubyVersion: 2.1
DisplayCopNames: true
Expand Down
Expand Up @@ -55,8 +55,14 @@ def create_derivatives(_filename)
# as this plugin makes derivatives of derivative, _filename is ignored
source_file = alto
return if source_file.nil?
# Image width from characterized primary file helps ensure proper scaling:
file = @file_set.original_file
width = file.nil? ? nil : file.width[0].to_i
# ALTOReader is responsible for transcoding, this class just saves result
reader = NewspaperWorks::TextExtraction::AltoReader.new(source_file)
reader = NewspaperWorks::TextExtraction::AltoReader.new(
source_file,
width
)
save_derivative('json', reader.json)
save_derivative('txt', reader.text)
end
Expand Down
1 change: 1 addition & 0 deletions lib/newspaper_works.rb
Expand Up @@ -4,6 +4,7 @@
require "newspaper_works/data"
require "newspaper_works/configuration"
require "newspaper_works/page_finder"
require "newspaper_works/logging"

# Newspaper works modules
module NewspaperWorks
Expand Down
61 changes: 50 additions & 11 deletions lib/newspaper_works/ingest/ndnp/batch_ingester.rb
@@ -1,21 +1,69 @@
require 'find'
require 'date'
require 'find'
require 'optparse'

module NewspaperWorks
module Ingest
module NDNP
class BatchIngester
include NewspaperWorks::Logging

attr_accessor :path, :batch

# Alternate constructor for command-line use, built from ARGV-style
# arguments.
#
# Reports a usage error through missing_path (which exits only for
# rake commands) when no path option was given, or when no batch XML
# file can be located for the given path.
#
# @param options [Array<String>] command-line arguments, e.g. ARGV
# @param cmd_name [String] command name shown in usage output
# @return [BatchIngester, nil] ingester for the located batch XML, or
#   nil when the path was missing/not found and missing_path returned
def self.from_command(options, cmd_name)
  path = batch_path(options, cmd_name)
  return missing_path(cmd_name) if path.nil?
  # xml_path yields nil for a directory that contains no batch XML file
  batch_xml = xml_path(path)
  if batch_xml.nil? || !File.exist?(batch_xml)
    return missing_path(cmd_name, "Not found: #{path}")
  end
  # whitelist the batch directory for ingest
  Hyrax.config.whitelisted_ingest_dirs.push(File.dirname(batch_xml))
  new(batch_xml)
end

# Print usage plus a reason to standard error; terminate the process
# with status 1 only when invoked as a rake command.
#
# @param cmd_name [String] command name shown in the usage line
# @param msg [String] reason reported after the usage line
# @return [nil] when cmd_name does not start with 'rake'
def self.missing_path(cmd_name, msg = "Missing path argument")
  STDERR.puts(
    "Usage: #{cmd_name} -- --path=PATH",
    "#{msg}. Exiting."
  )
  return unless cmd_name.start_with?('rake')
  # rubocop:disable Rails/Exit
  exit(1)
  # rubocop:enable Rails/Exit
end

# Extract the value of the -i / --path option from command-line arguments.
#
# Parsing happens in two passes: order! runs first, with an empty block
# and before any option is registered, and only its result is then parsed
# for the path option. Presumably this consumes the leading task name and
# the `--` separator that a rake invocation leaves in ARGV -- confirm
# against the rake task. Note that order! modifies `options` in place.
#
# @param options [Array<String>] command-line arguments, e.g. ARGV
# @param cmd_name [String] command name used in the parser banner
# @return [String, nil] the path option value, or nil when not given
def self.batch_path(options, cmd_name)
  parser = OptionParser.new
  remaining = parser.order!(options) {}
  parser.banner = "Usage: #{cmd_name} -- --path=PATH"
  selected = nil
  parser.on('-i PATH', '--path PATH') { |value| selected = value }
  parser.parse!(remaining)
  selected
end

# Resolve a batch XML file path from a file or directory path.
#
# A non-directory path is returned unchanged (it is not checked for
# existence). A directory is searched recursively for files whose
# names end, case-insensitively, with 'batch_1.xml' or 'batch.xml';
# a '*_1.xml' match is preferred over the others.
#
# @param path [String] path to a batch XML file or containing directory
# @return [String, nil] resolved path, or nil if the directory has no match
def self.xml_path(path)
  return path unless File.directory?(path)
  candidates = Find.find(path).select do |f|
    f.downcase.end_with?('batch_1.xml', 'batch.xml')
  end
  # compare case-insensitively here too, matching the selection above
  candidates.find { |f| f.downcase.end_with?('_1.xml') } || candidates.first
end

# @param path [String] path to a batch XML file, or to a directory
#   that is searched for one
# @raise [IOError] when no batch XML file can be resolved from path
def initialize(path)
  # NOTE(review): the two assignments below are both present in the
  #   rendered diff; the second overwrites the first.
  @path = xml_path(path)
  @path = self.class.xml_path(path)
  # self.class.xml_path returns nil when a directory holds no batch XML,
  # so guard against nil as well as an empty string
  raise IOError, "No batch file found: #{path}" if @path.to_s.empty?
  @batch = batch_enumerator
  configure_logger('ingest')
end

# Ingest every issue enumerated by the batch, writing a log entry
# before the first issue and after the last one.
def ingest
  write_log("Beginning NDNP batch ingest for #{@path}")
  batch.each { |entry| issue_ingester(entry).ingest }
  write_log("NDNP batch ingest complete!")
end

private
Expand All @@ -32,15 +80,6 @@ def issue_ingester(issue)
# Render a date value as a string; String input is first parsed
# with Date.parse, any other value is converted with to_s directly.
#
# @param v [String, Object] date string, or an object responding to to_s
# @return [String]
def normalize_date(v)
  v = Date.parse(v) if v.is_a?(String)
  v.to_s
end

# Resolve a batch XML file path from a file or directory path.
#
# A non-directory path is returned unchanged; a directory is searched
# recursively for a file whose name ends (case-insensitively) with
# 'batch_1.xml'.
#
# @param path [String] path to a batch XML file or containing directory
# @return [String] resolved path
# @raise [IOError] when a directory contains no matching file
def xml_path(path)
  return path unless File.directory?(path)
  matches = Find.find(path).select do |f|
    f.downcase.end_with?('batch_1.xml')
  end
  # double quotes are required for the path to be interpolated
  raise IOError, "Batch file not found: #{path}" if matches.empty?
  matches.first
end
end
end
end
Expand Down
4 changes: 2 additions & 2 deletions lib/newspaper_works/ingest/ndnp/container_ingester.rb
Expand Up @@ -26,7 +26,7 @@ def ingest
# Link a page to target container
# @param page [NewspaperPage]
def link(page)
@target.members << page
@target.ordered_members << page
# save each link attempt (for now no deferring/bundling)
@target.save!
end
Expand Down Expand Up @@ -69,7 +69,7 @@ def copy_metadata

def link_publication
return unless @target.publication.nil?
@publication.members << @target
@publication.ordered_members << @target
@publication.save!
end
end
Expand Down
34 changes: 28 additions & 6 deletions lib/newspaper_works/ingest/ndnp/issue_ingester.rb
Expand Up @@ -2,6 +2,8 @@ module NewspaperWorks
module Ingest
module NDNP
class IssueIngester
include NewspaperWorks::Logging

attr_accessor :batch, :issue, :target

delegate :path, to: :issue
Expand All @@ -23,6 +25,7 @@ def initialize(issue, batch = nil)
@issue = issue
@batch = batch
@target = nil
configure_logger('ingest')
end

def ingest
Expand All @@ -37,13 +40,13 @@ def construct_issue

def ingest_pages
issue.each do |page|
NewspaperWorks::Ingest::NDNP::PageIngester.new(page, @target).ingest
page_ingest(page)
end
end

private

def page_ingester(page_data)
def page_ingest(page_data)
NewspaperWorks::Ingest::NDNP::PageIngester.new(
page_data,
@target
Expand All @@ -70,6 +73,7 @@ def create_issue
@target = NewspaperIssue.create
copy_issue_metadata
@target.save!
write_log("Saved metadata to new NewspaperIssue #{@target.id}")
end

# @param lccn [String] Library of Congress Control Number
Expand All @@ -87,14 +91,32 @@ def copy_publication_title(publication)
publication.place_of_publication = [uri] unless uri.nil?
end

# Create and save a new NewspaperTitle, populated through
# copy_publication_title; the given LCCN is applied only when the
# new work has no lccn value after that step.
#
# @param lccn [String] Library of Congress Control Number
# @return [NewspaperTitle] the saved publication work
def create_publication(lccn)
  NewspaperTitle.create.tap do |title|
    copy_publication_title(title)
    title.lccn ||= lccn
    title.save!
    message = "Created NewspaperTitle work #{title.id} for LCCN #{lccn}"
    write_log(message)
  end
end

def find_or_create_linked_publication
lccn = issue.metadata.lccn
publication = find_publication(lccn)
publication = NewspaperTitle.create if publication.nil?
copy_publication_title(publication)
publication.lccn ||= lccn
publication.members << @target
unless publication.nil?
write_log(
"Found existing NewspaperTitle #{publication.id}, LCCN #{lccn}"
)
end
publication = create_publication(lccn) if publication.nil?
publication.ordered_members << @target
publication.save!
write_log(
"Linked NewspaperIssue #{@target.id} to "\
"NewspaperTitle work #{publication.id}"
)
end
end
end
Expand Down
64 changes: 57 additions & 7 deletions lib/newspaper_works/ingest/ndnp/page_ingester.rb
@@ -1,7 +1,11 @@
require 'newspaper_works/logging'

module NewspaperWorks
module Ingest
module NDNP
# rubocop:disable Metrics/ClassLength
class PageIngester
include NewspaperWorks::Logging
attr_accessor :page, :issue, :target

delegate :path, :dmdid, to: :page
Expand All @@ -25,6 +29,8 @@ def initialize(page, issue)
@issue = issue
# target is to-be-created NewspaperPage:
@target = nil
@work_files = nil
configure_logger('ingest')
end

def ingest
Expand All @@ -34,28 +40,34 @@ def ingest
end

def construct_page
@target = NewspaperPage.create
@target.title = page_title
@target = NewspaperPage.create!(title: page_title)
write_log(
"Created NewspaperPage work #{@target.id} "\
"with title '#{@target.title[0]}'"
)
copy_page_metadata
link_issue
@target.save!
write_log("Saved metadata to NewspaperPage work #{@target.id}")
end

# Ingest primary, derivative files; other derivatives including
# thumbnail, plain-text, json will be made by NewspaperWorks
# derivative service components as a consequence of committing
# files assigned (via actor stack, via WorkFiles).
def ingest_page_files
work_files = NewspaperWorks::Data::WorkFiles.new(@target)
@work_files = NewspaperWorks::Data::WorkFiles.new(@target)
page.files.each do |path|
ext = path.downcase.split('.')[-1]
if ['tif', 'tiff'].include?(ext)
work_files.assign(path)
ingest_primary_file(path)
else
work_files.derivatives.assign(path)
ingest_derivative_file(path)
end
end
work_files.commit!
write_log("Beginning file attachment process (WorkFiles.commit!) "\
"for work #{@target.id}")
@work_files.commit!
end

def link_reel
Expand All @@ -73,9 +85,46 @@ def link_reel

private

# Assign the primary (TIFF) file for the page to the work's files.
#
# When the given TIFF path does not exist on disk, a TIFF is generated
# via make_tiff from the page's PDF and assigned instead.
#
# @param path [String] path to the page's TIFF file
def ingest_primary_file(path)
  unless File.exist?(path)
    # match the extension case-insensitively, as is done when the page's
    # files are classified by extension
    pdf_path = page.files.find { |f| f.downcase.end_with?('.pdf') }
    # make and get TIFF path (to generated tmp file):
    path = make_tiff(pdf_path)
  end
  write_log("Assigned primary file to work #{@target.id}, #{path}")
  @work_files.assign(path)
end

# Log, then assign, a derivative file for the page's work.
#
# @param path [String] path to the derivative file
def ingest_derivative_file(path)
  message = "Assigned derivative file to work #{@target.id}, #{path}"
  write_log(message)
  derivatives = @work_files.derivatives
  derivatives.assign(path)
end

def link_issue
issue.members << @target # page
issue.ordered_members << @target # page
issue.save!
write_log(
"Linked NewspaperIssue work #{issue.id} "\
"to NewspaperPage work #{@target.id}"
)
end

# Directory whitelist for ingest, read from the Hyrax configuration
# (Hyrax.config.whitelisted_ingest_dirs). The configured object is
# returned directly, not a copy of it.
def whitelist
Hyrax.config.whitelisted_ingest_dirs
end

# Generate TIFF in temporary file, return its path, given path to PDF.
# Logs a warning first, and adds the system temporary directory to the
# ingest whitelist if it is not already listed.
#
# NOTE(review): Dir.tmpdir comes from the 'tmpdir' standard library; no
#   require for it is visible here -- confirm it is loaded elsewhere.
#
# @param pdf_path [String] path to single-page PDF
# @return [String] path to generated TIFF
def make_tiff(pdf_path)
  write_log(
    "Creating TIFF from PDF in lieu of missing TIFF for work "\
    "(#{@target.id})",
    Logger::WARN
  )
  whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
  # only the first page's TIFF is used
  NewspaperWorks::Ingest::PdfPages.new(pdf_path).to_a.first
end

# Page title as issue title plus page title
Expand All @@ -96,6 +145,7 @@ def copy_page_metadata
end
end
end
# rubocop:enable Metrics/ClassLength
end
end
end
11 changes: 7 additions & 4 deletions lib/newspaper_works/ingest/ndnp/page_metadata.rb
Expand Up @@ -31,17 +31,20 @@ def inspect
# "Number" is used liberally, and may contain both alpha
# and numeric characters. As such, return value is String.
#
# Recommendation: callers may (strongly recommended) fall back to:
# `page.page_number || page.page_sequence_number.to_s`,
# however, this is not implemented automatically by this class.
# If NDNP issue data fails to provide an explicitly
# human-readable page number, fallback to sequence
# number, in String form.
#
# @return [String, NilClass] Page "number" string
def page_number
detail = dmd_node.xpath(
".//mods:mods//mods:detail[@type='page number']",
**XML_NS
)
return nil if detail.size.zero?
if detail.size.zero?
fallback = page_sequence_number
return fallback.nil? ? nil : fallback.to_s
end
detail.xpath("mods:number", **XML_NS).first.text
end

Expand Down

0 comments on commit fb6879a

Please sign in to comment.