Skip to content

Commit

Permalink
Fix problem with spaces and spans in FI
Browse files Browse the repository at this point in the history
  • Loading branch information
ngiger committed Jul 26, 2013
1 parent d03a007 commit acc8644
Show file tree
Hide file tree
Showing 5 changed files with 440 additions and 203 deletions.
4 changes: 2 additions & 2 deletions ext/fiparse/src/fachinfo_hpricot.rb
Expand Up @@ -109,13 +109,13 @@ def to_textinfo
end
private
def detect_chapter(elem)
return [nil, nil] unless elem.attributes['id'].to_s =~ /^section[0-9]*$/
return [nil, nil] unless /^section[0-9]*$/i.match(elem.attributes['id'].to_s)
# TODO
# Update chapter detection if swissmedic repairs FI/PI format.
#
# Currently, id attribute 'section*' is not fixed number.
# And Section order is also not fixed :(
text = text(elem)
text = text(elem).sub(/^\s/, '')
code =
case text
when /^Zusammensetzung(en)?|^Composition[s]?/ ; '7000'
Expand Down
16 changes: 10 additions & 6 deletions ext/fiparse/src/textinfo_hpricot.rb
Expand Up @@ -73,6 +73,11 @@ def extract(doc, type=:fi, name=nil, styles = nil)
end
(doc/paragraph_tag).each { |elem|
identify_chapter(*chapter(elem)) if !name or elem != name
}
paragraph_tag_pre_2013 = "div[@id^='Section']"
(doc/paragraph_tag_pre_2013).each {
|elem|
identify_chapter(*chapter(elem)) if !name or elem != name
}
to_textinfo
end
Expand Down Expand Up @@ -120,8 +125,7 @@ def detect_text_block(elem) # for swissmedicinfo format
text
end
def handle_element(child, ptr, isParagraph=false)
# puts "handle_element #{child.class} #{child.name} isParagraph #{isParagraph}"
ptr.target << ' ' if isParagraph and !/^Zulassungsnummer[n]?|^Num.ro\s*d.autorisation/.match(ptr.chapter.to_s)
ptr.target << ' ' if self.class.eql?(ODDB::FiParse::PatinfoHpricot) and isParagraph and !/^Zulassungsnummer[n]?|^Num.ro\s*d.autorisation/.match(ptr.chapter.to_s)
case child
when Hpricot::Text
if ptr.target.is_a? Text::Table
Expand Down Expand Up @@ -167,9 +171,7 @@ def handle_element(child, ptr, isParagraph=false)
ptr.target.reduce_format(:italic) if has_italic?(child, ptr)
end
when 'sub', 'sup'
ptr.target << ' '
handle_text(ptr, child)
ptr.target << ' '
when 'table'
ptr.section = ptr.chapter.next_section
if detect_table?(child)
Expand Down Expand Up @@ -288,7 +290,7 @@ def simple_chapter(elem_or_str)
if(elem_or_str)
chapter = Text::Chapter.new
if elem_or_str.is_a?(Hpricot::Elem)
chapter.heading = text(elem_or_str)
chapter.heading = text(elem_or_str).strip
elsif elem_or_str.is_a?(String)
chapter.heading = elem_or_str
end
Expand Down Expand Up @@ -332,7 +334,9 @@ def target_encoding(text)
# Returns the text of +elem+ with runs of whitespace / '&nbsp;' collapsed to a
# single space and '■' characters removed, passed through target_encoding.
# Returns '' when +elem+ is nil or false.
#
# NOTE(review): this listing is a rendered commit diff without +/- markers.
# The bare target_encoding(...) statement below is presumably the removed
# pre-change line, and the `res` lines its replacement — confirm against the
# repository before treating this body as the live method.
def text(elem)
return '' unless elem
# Falls back to elem.to_s when inner_text returns nil/false.
str = elem.inner_text || elem.to_s
# The result of this statement is neither assigned nor returned.
target_encoding(str.gsub(/(&nbsp;|\s)+/u, ' ').gsub(/[■]/u, '').strip)
# NOTE(review): as rendered here, both arguments of the final gsub look like a
# plain space (a no-op); presumably the first was a non-breaking space in the
# original file — confirm.
res = target_encoding(str.gsub(/(&nbsp;|\s)+/u, ' ').gsub(/[■]/u, '').gsub(' ', ' '))
# Leading/trailing whitespace is stripped only when the receiver's class name
# is exactly 'ODDB::FiParse::PatinfoHpricot'; other classes keep it.
res.strip! if self.class.to_s.eql?('ODDB::FiParse::PatinfoHpricot')
res
end
end
end
Expand Down

0 comments on commit acc8644

Please sign in to comment.