Skip to content

Commit

Permalink
batch 9 jpg
Browse files Browse the repository at this point in the history
  • Loading branch information
mnyrop committed Nov 10, 2023
1 parent 0dbac8b commit f752a51
Show file tree
Hide file tree
Showing 1,740 changed files with 25,993 additions and 54 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ build
aperitiiif
aperitiiif-cli

src/data-original/
src/pdfs/
8 changes: 6 additions & 2 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
source 'https://rubygems.org'

gem 'aperitiiif', github: 'middlicomp/aperitiiif-cli', tag: 'v0.1.2'
gem 'pdf-reader'
gem 'aperitiiif', github: 'middlicomp/aperitiiif-cli', branch: 'main' # tag: 'v0.1.2'
# gem 'aperitiiif', path: 'aperitiiif-cli'

group :development do
gem 'pdf-reader'
end
56 changes: 31 additions & 25 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
GIT
remote: https://github.com/middlicomp/aperitiiif-cli.git
revision: a91d2e30228ef223348713a021adef9422d98eee
tag: v0.1.2
revision: f63fe7eceffe0ffdc1c2d8f787ac63e9362853df
branch: main
specs:
aperitiiif (0.1.2)
colorize
iiif-presentation
iiif-presentation (= 1.1)
liquid
mimemagic
parallel
Expand All @@ -18,21 +18,32 @@ GEM
remote: https://rubygems.org/
specs:
Ascii85 (1.1.0)
activesupport (7.0.4.2)
activesupport (7.1.1)
base64
bigdecimal
concurrent-ruby (~> 1.0, >= 1.0.2)
connection_pool (>= 2.2.5)
drb
i18n (>= 1.6, < 2)
minitest (>= 5.1)
mutex_m
tzinfo (~> 2.0)
afm (0.2.2)
colorize (0.8.1)
concurrent-ruby (1.2.0)
faraday (2.7.4)
base64 (0.2.0)
bigdecimal (3.1.4)
colorize (1.1.0)
concurrent-ruby (1.2.2)
connection_pool (2.4.1)
drb (2.2.0)
ruby2_keywords
faraday (2.7.11)
base64
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-net_http (3.0.2)
ffi (1.15.5)
ffi (1.16.3)
hashery (2.1.2)
i18n (1.12.0)
i18n (1.14.1)
concurrent-ruby (~> 1.0)
iiif-presentation (1.1.0)
activesupport (>= 3.2.18)
Expand All @@ -43,41 +54,36 @@ GEM
mimemagic (0.4.3)
nokogiri (~> 1)
rake
minitest (5.17.0)
nokogiri (1.14.0-arm64-darwin)
minitest (5.20.0)
mutex_m (0.2.0)
nokogiri (1.15.4-arm64-darwin)
racc (~> 1.4)
nokogiri (1.14.0-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.14.0-x86_64-linux)
racc (~> 1.4)
parallel (1.22.1)
parallel (1.23.0)
pdf-reader (2.11.0)
Ascii85 (~> 1.0)
afm (~> 0.2.1)
hashery (~> 2.0)
ruby-rc4
ttfunk
racc (1.6.2)
rake (13.0.6)
ruby-progressbar (1.11.0)
racc (1.7.3)
rake (13.1.0)
ruby-progressbar (1.13.0)
ruby-rc4 (0.1.5)
ruby-vips (2.1.4)
ruby-vips (2.2.0)
ffi (~> 1.12)
ruby2_keywords (0.0.5)
safe_yaml (1.0.5)
thor (1.2.1)
thor (1.3.0)
ttfunk (1.7.0)
tzinfo (2.0.5)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)

PLATFORMS
arm64-darwin-22
x86_64-darwin-21
x86_64-linux

DEPENDENCIES
aperitiiif!
pdf-reader

BUNDLED WITH
2.3.12
2.3.24
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ TO DO
  - [@mnyrop](https://github.com/mnyrop)

## Notes
- Run csv initialization creation script with `bundle exec ruby lib/create-csv.rb`
- Run csv initialization & pdf splitting script with `bundle exec ruby lib/split-pdfs-populate-csv.rb`
25 changes: 0 additions & 25 deletions lib/create-csv.rb

This file was deleted.

45 changes: 45 additions & 0 deletions lib/split-pdfs-populate-csv.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
require 'fileutils'
require 'pdf-reader'
# require 'ruby-progressbar'
require 'vips'

# Splits every PDF found in src/pdfs/ into one JPEG per page (written to
# src/data/) and records one CSV row per page in src/records.csv.
#
# CSV columns: id, label, a_number, parent_pdf_id, redacted, extracted_text
#   - id / label    : "<parent_pdf_id>_<zero-based page index, 4 digits>"
#   - a_number      : parent_pdf_id with any "_redacted" marker removed
#   - redacted      : true when the PDF file name contains "redacted"
#   - extracted_text: page text with line breaks collapsed to "|" and double
#                     quotes replaced by single quotes so the field can be
#                     wrapped in double quotes without further escaping
#
# NOTE(review): id and a_number come straight from the file name and are
# written unquoted; a file name containing a comma would break the row.
csv_file   = './src/records.csv'
pdf_dir    = './src/pdfs/'
data_dir   = './src/data/'
pdfs       = Dir.glob("#{pdf_dir}*.pdf")
pdfs_count = pdfs.length

# Create the output tree first so the CSV can be opened on a fresh checkout.
FileUtils.mkdir_p data_dir

# Open the CSV once (truncating any previous run) instead of re-opening it in
# append mode for every page.
File.open(csv_file, 'w') do |csv|
  # Flush each row immediately so finished rows survive an interrupted run,
  # as they did when the file was re-opened per page.
  csv.sync = true
  csv.puts 'id,label,a_number,parent_pdf_id,redacted,extracted_text'

  pdfs.each_with_index do |path, i|
    # Force a collection between documents to release the previous PDF's
    # parsed objects and image buffers before loading the next one.
    GC.start

    reader        = PDF::Reader.new(path)
    parent_pdf_id = File.basename(path, '.pdf')
    redacted      = parent_pdf_id.include? 'redacted'
    a_number      = parent_pdf_id.sub('_redacted', '')

    # Build the page list once per PDF; calling reader.pages inside the loop
    # would re-create every page object on each iteration.
    pages             = reader.pages
    parent_page_count = pages.length

    pages.each_with_index do |page, index|
      # Page ids and image file names keep the zero-based, zero-padded index.
      page_num = index.to_s.rjust(4, '0')
      id       = "#{parent_pdf_id}_#{page_num}"
      target   = "#{data_dir}#{id}.jpg"
      text     = page.text.to_s.gsub(/\R+/, '|').tr('"', "'")
      row      = [id, id, a_number, parent_pdf_id, redacted, "\"#{text}\""]

      csv.puts row.join(',')

      # To skip pages already rendered, use `next if File.exist? target`
      # here (`return` is not valid for skipping an iteration).

      # The loader's page option is zero-based, matching index.
      img = Vips::Image.new_from_file path, page: index
      img.jpegsave target

      puts "wrote #{target}; page #{index + 1} / #{parent_page_count}"
    end

    done    = i + 1
    percent = (done * 100.0 / pdfs_count).round(1)
    puts "finished pdf #{done} / #{pdfs_count} (#{percent}% complete)"
  end
end
Loading

0 comments on commit f752a51

Please sign in to comment.