Skip to content

Commit

Permalink
Try to escape invalid UTF-8 characters during harvesting.
Browse files Browse the repository at this point in the history
git-svn-id: http://oai.rubyforge.org/svn/trunk@948 4dc5e89f-90f6-0310-ab54-a6a856e7c30e
  • Loading branch information
wilig committed Nov 26, 2007
1 parent 254c7ce commit c807455
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 22 deletions.
36 changes: 17 additions & 19 deletions lib/oai/client.rb
Expand Up @@ -155,7 +155,7 @@ def list_sets(opts={})
def do_request(verb, opts = nil)
# fire off the request and return appropriate DOM object
uri = build_uri(verb, opts)
xml = get(uri)
xml = strip_invalid_utf_8_chars(get(uri))
if @parser == 'libxml'
# remove default namespace for oai-pmh since libxml
# isn't able to use our xpaths to get at them
Expand Down Expand Up @@ -184,36 +184,20 @@ def encode(value)
end

# Parse +xml+ with the parser selected by @parser and return the parsed
# document.
#
# The response text is sanitised before it gets here (do_request runs it
# through strip_invalid_utf_8_chars), so a parse failure is reported right
# away instead of being retried through Iconv. The old retry path mutated the
# caller's string, printed debugging output and, in the libxml branch,
# re-parsed the unmodified input.
#
# xml:: the response body as a String
#
# Returns the parsed document, or nil when @parser is neither 'libxml' nor
# 'rexml'.
# Raises OAI::Exception when the parser rejects the input.
def load_document(xml)
  case @parser
  when 'libxml'
    begin
      parser = XML::Parser.new
      parser.string = xml
      parser.parse
    rescue XML::Parser::ParseError => e
      # e.message rather than "+ e": String + Exception only worked while
      # Exception still implemented to_str.
      raise OAI::Exception, "response not well formed XML: #{e.message}", caller
    end
  when 'rexml'
    begin
      REXML::Document.new(xml)
    rescue REXML::ParseException => e
      raise OAI::Exception, "response not well formed XML: #{e.message}", caller
    end
  end
end
Expand Down Expand Up @@ -296,5 +280,19 @@ def parse_date(value)
dt.utc
end


# Replace byte sequences that are not acceptable in a harvested response
# with '?'. The patterns are an inversion of the W3C "is this UTF-8?" regex:
# http://www.w3.org/International/questions/qa-forms-utf-8.en.php
#
# What gets replaced (one '?' per matched run):
# * control characters other than tab, LF and CR, plus DEL
# * an ASCII byte followed by stray continuation bytes
# * the lead bytes C0, C1 and F0-FF together with any continuation bytes
#   (note: this includes every 4-byte sequence, well formed or not)
# * 2- and 3-byte lead bytes with the wrong number of continuation bytes
# * overlong E0 sequences and the ED surrogate range (second pass)
#
# xml:: the raw response body as a String; it is not modified.
#
# Returns a new String carrying the same encoding tag as +xml+.
def strip_invalid_utf_8_chars(xml)
  # On Rubies with per-string encodings a regex cannot be matched against a
  # string holding broken UTF-8, so the work is done on a binary-tagged copy
  # and the original tag is restored at the end. Older Rubies have no
  # String#encoding and operate on the raw bytes directly.
  source_encoding = xml.respond_to?(:encoding) ? xml.encoding : nil
  bytes = source_encoding ? xml.dup.force_encoding('BINARY') : xml

  # /n keeps the \x80-\xFF escapes byte-oriented; /x allows the layout below.
  simple_bytes = bytes.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]
                           | [\x00-\x7F][\x80-\xBF]+
                           | ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
                           | [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
                           | [\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))
                           | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/nx, '?')

  # Second pass: sequences with the right length but a forbidden value range.
  # This pattern needs /x as well, otherwise its line break and indentation
  # are matched literally and it never fires.
  cleaned = simple_bytes.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
                             | \xED[\xA0-\xBF][\x80-\xBF]/nx, '?')

  source_encoding ? cleaned.force_encoding(source_encoding) : cleaned
end

end
end
1 change: 1 addition & 0 deletions lib/oai/harvester.rb
Expand Up @@ -9,6 +9,7 @@
require 'chronic'
require 'socket'

require 'oai/client'
require 'oai/harvester/config'
require 'oai/harvester/harvest'
require 'oai/harvester/logging'
Expand Down
2 changes: 1 addition & 1 deletion lib/oai/harvester/harvest.rb
Expand Up @@ -107,7 +107,7 @@ def call(url, opts)
end

# Collect the record elements found at /OAI-PMH/ListRecords/record.
#
# doc:: an object whose #doc exposes a parsed document with
#       root.elements.to_a(xpath) -- presumably an OAI response wrapping a
#       REXML document; confirm against the caller.
#
# Returns whatever elements.to_a yields for that XPath.
#
# The earlier doc.find(...).to_a lookup was dropped: its result was thrown
# away and it required the argument to respond to #find.
def get_records(doc)
  doc.doc.root.elements.to_a("/OAI-PMH/ListRecords/record")
end

def build_options_hash(site)
Expand Down
4 changes: 2 additions & 2 deletions lib/oai/harvester/shell.rb
Expand Up @@ -47,8 +47,8 @@ def start
end
rescue
puts "Not a recognized command, or bad options. Type 'help' for clues."
#puts $!
#puts $!.backtrace.join("\n")
puts $!
puts $!.backtrace.join("\n")
end
end
end
Expand Down
1 change: 1 addition & 0 deletions test/client/tc_list_records.rb
@@ -1,6 +1,7 @@
require 'test_helper'

class GetRecordsTest < Test::Unit::TestCase

def test_get_records
client = OAI::Client.new 'http://localhost:3333/oai'
response = client.list_records
Expand Down
11 changes: 11 additions & 0 deletions test/client/tc_utf8_escaping.rb
@@ -0,0 +1,11 @@
require 'test_helper'

class UTF8Test < Test::Unit::TestCase

  # Feeds a string mixing control characters, code points above U+FFFF and
  # values past the Unicode maximum through the client's sanitiser and checks
  # that, once the '?' placeholders are removed, only "hello!!" is left.
  def test_escaping_invalid_utf_8_characters
    client = OAI::Client.new 'http://localhost:3333/oai', :parser => 'libxml'
    code_points = [2, 3, 4, 104, 5, 101, 6, 108, 66897, 108, 66535, 111, 1114112, 33, 55234123, 33]
    invalid_utf_8 = code_points.pack("U*")
    # strip_invalid_utf_8_chars is reached through send rather than a direct call.
    sanitised = client.send(:strip_invalid_utf_8_chars, invalid_utf_8)
    assert_equal("hello!!", sanitised.gsub(/\?/, ''))
  end

end

0 comments on commit c807455

Please sign in to comment.