In [1]:
import pandas as pd
from __future__ import print_function
import fitz
import sys
from operator import itemgetter
import re

In [2]:
# Path of the PDF safety data sheet to analyse.
file_path = "../data/586346.pdf"  # e.g. /pdf_file/data.pdf

# One regex per standard section heading: a section number (one or more
# digits) followed, anywhere later in the text, by the section keyword.
# The patterns are lower-case; callers lower-case the heading before matching.
standard_headers = [
    r"(\d+.*" + keyword + ")"
    for keyword in (
        "identification",
        "hazard",
        "composition",
        "first.aid",
        "fire.fighting",
        "accidental release",
        "handling",
        "exposure",
        "physical and chemical",
        "stability and reactivity",
        "toxicological",
        "ecological",
        "disposal",
        "transport",
        "regulatory",
        "other information",
    )
]

In [3]:
# Open the PDF at `file_path` with PyMuPDF. The resulting `doc` is iterated
# page by page by the extraction functions in this notebook.
# NOTE(review): the document is never explicitly closed in this notebook.
doc = fitz.open(file_path)

In [4]:
# def parse_to_html(pdf):
#     """
#     Parses pdf file to html object
#     Use filepath of pdf as argument
#     """
#     doc = fitz.open(pdf)
#     for page in doc:
#         html_content = page.getText("html")
#     return html_content

In [5]:
def get_font_style_counts(doc, granularity=False):
    """Count how often each text style occurs in a PDF document.

    Every span of every text block is counted, including whitespace-only spans.

    :param doc: PDF document to iterate through; each page must provide
        ``get_text("dict")`` (current PyMuPDF) or the legacy ``getText("dict")``
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: if True, styles are keyed by the tuple
        ``(uppercase_flag, bold_flag, size)``; if False, by the font size
        formatted as a string
    :type granularity: bool
    :rtype: [(style_key, count), (style_key, count)], dict
    :return: style keys sorted by usage (most used first), and a dict mapping
        each style key to the attributes of the last span seen with that key
    :raises ValueError: if the document contains no text spans at all
    """
    styles = {}
    usage = {}

    for page in doc:
        # Prefer the snake_case API; fall back to the camelCase name that
        # older PyMuPDF releases expose.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the blocks
            if b['type'] != 0:  # only type 0 blocks contain text
                continue
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if granularity:
                        identifier = (
                            int(s['text'].isupper()),
                            int('bold' in s['font'].lower()),
                            float(s['size']),
                        )
                        styles[identifier] = {'size': s['size'], 'flags': s['flags'],
                                              'font': s['font'], 'color': s['color']}
                    else:
                        identifier = "{0}".format(s['size'])
                        styles[identifier] = {'size': s['size'], 'font': s['font']}

                    usage[identifier] = usage.get(identifier, 0) + 1  # count the style usage

    # Stable sort: styles with equal counts keep first-seen order.
    font_counts = sorted(usage.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [6]:
def get_font_tags(font_counts, styles):
    """Map every style key to an element tag (``<hN>``, ``<p>`` or ``<sN>``).

    The most frequently used style (first entry of ``font_counts``) is treated
    as body text and tagged ``<p>``. All other styles are numbered in
    descending order of ``(uppercase_flag, size, bold_flag)``; the numbering
    starts again after the paragraph style has been passed.

    :param font_counts: ((uppercase_flag, bold_flag, size), count) pairs,
        most used style first
    :type font_counts: list
    :param styles: all styles found in the document (not used here)
    :type styles: dict
    :rtype: dict
    :return: tag for each style key, in the order the tags were assigned
    """
    paragraph_style = font_counts[0][0]  # most used style is the paragraph style

    # Rank styles: uppercase first, then larger size, then bold.
    ranked_styles = sorted(
        [(upper, bold, font_size) for (upper, bold, font_size), _count in font_counts],
        key=lambda style: (style[0], style[2], style[1]),
        reverse=True,
    )

    style_tag = {}
    position = 0
    for style in ranked_styles:
        if style == paragraph_style:
            position = 0  # numbering restarts after the paragraph style
            style_tag[style] = '<p>'
            continue
        position += 1
        # Strictly smaller than body text -> "s" tag; same size or larger -> header.
        prefix = 's' if style[2] < paragraph_style[2] else 'h'
        style_tag[style] = '<{0}{1}>'.format(prefix, position)
    return style_tag

In [7]:
def _span_style_key(span):
    """Return the (uppercase_flag, bold_flag, size) key used to look up a span's tag."""
    return (int(span['text'].isupper()), int('bold' in span['font'].lower()), float(span['size']))


def _record_text(text, key, style_tag, header_para, header_dict):
    """Append ``text`` to ``header_para``; header text is also filed under its tag."""
    header_para.append(text)
    if text.startswith("<h"):
        # Store the text without its leading "<hN>" tag.
        header_dict.setdefault(style_tag[key], []).append(text[text.index(">") + 1:])


def assign_tags_to_content(doc, style_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Consecutive non-blank spans sharing the same style are joined into one
    string prefixed with that style's tag. A ``|`` is appended after every
    line of a block. Whitespace-only spans are ignored. When the style
    changes, the text collected so far in the current block is emitted, even
    if it is empty or consists of pipes only.

    :param doc: PDF document to iterate through; each page must provide
        ``get_text("dict")`` (current PyMuPDF) or the legacy ``getText("dict")``
    :type doc: <class 'fitz.fitz.Document'>
    :param style_tag: textual element tags for each style (uppercase_flag, bold_flag, size)
    :type style_tag: dict
    :rtype: list, dict
    :return: texts with pre-pended element tags, and a dict mapping each header
        tag (``<hN>``) to the list of header texts found with that tag
    :raises KeyError: if a span's style key is missing from ``style_tag``
    """
    header_para = []  # list with headers and paragraphs
    header_dict = {}  # header tag -> header texts, in document order
    previous_key = None  # style key of the last non-blank span (None before the first)
    s_key = None  # style key of the current non-blank span

    for page in doc:
        # Prefer the snake_case API; fall back to the camelCase name that
        # older PyMuPDF releases expose.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the blocks
            if b['type'] != 0:  # only type 0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block

            block_string = ""  # text found in block
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # skip whitespace-only spans
                        continue

                    s_key = _span_style_key(s)
                    if previous_key is None:
                        # very first non-blank span of the document
                        block_string = style_tag[s_key] + s['text']
                    elif s_key == previous_key:
                        if block_string == "" or all(c == "|" for c in block_string):
                            # Nothing but line markers collected so far: start
                            # the tagged text here. These cases are exclusive
                            # with the concatenation below, so the span text
                            # is added exactly once.
                            block_string = style_tag[s_key] + s['text']
                        else:  # same style continues, so concatenate strings
                            block_string += " " + s['text']
                    else:
                        # style changed: emit what was collected, start afresh
                        _record_text(block_string, previous_key, style_tag, header_para, header_dict)
                        block_string = style_tag[s_key] + s['text']

                    previous_key = s_key

                # end of a line, indicating with a pipe
                block_string += "|"

            # end of the block: emit whatever is left
            _record_text(block_string, s_key, style_tag, header_para, header_dict)
    return header_para, header_dict

In [8]:
# Count span styles over the whole document, keyed by
# (uppercase_flag, bold_flag, size) because granularity is enabled.
font_counts, styles = get_font_style_counts(doc, granularity=True)

In [9]:
# Show the (style_key, count) pairs, most frequently used style first.
font_counts

[((0, 0, 9.691374778747559), 561),
 ((0, 1, 9.691374778747559), 58),
 ((1, 1, 9.691374778747559), 19),
 ((0, 0, 10.742137908935547), 18),
 ((1, 0, 9.691374778747559), 12),
 ((0, 0, 12.610512733459473), 9),
 ((0, 1, 12.610512733459473), 9),
 ((1, 1, 13.661530494689941), 9),
 ((0, 0, 11.676494598388672), 3),
 ((0, 0, 10.275195121765137), 2)]

In [10]:
# Derive an element tag for every style key; the most used style becomes <p>.
style_tag = get_font_tags(font_counts, styles)
style_tag

{(1, 1, 13.661530494689941): '<h1>',
 (1, 1, 9.691374778747559): '<h2>',
 (1, 0, 9.691374778747559): '<h3>',
 (0, 1, 12.610512733459473): '<h4>',
 (0, 0, 12.610512733459473): '<h5>',
 (0, 0, 11.676494598388672): '<h6>',
 (0, 0, 10.742137908935547): '<h7>',
 (0, 0, 10.275195121765137): '<h8>',
 (0, 1, 9.691374778747559): '<h9>',
 (0, 0, 9.691374778747559): '<p>'}

In [11]:
# Tag the document text; header_dict groups the header texts by their <hN> tag.
tagged_text, header_dict = assign_tags_to_content(doc, style_tag)
tagged_text

['<h5>Material Safety Data Sheet |',
 '||',
 '',
 '<h1>BYK-349 |',
 '',
 '<p>Version 5| Revision Date 02/04/2011| Print Date 02/04/2011  |',
 '||',
 '<p>1 / 9 |',
 '',
 '<h2>SECTION 1. PRODUCT AND COMPANY IDENTIFICATION |',
 '',
 '<p>Product name | : |',
 '<h3>BYK-349|',
 '',
 '<p>Product Use Description | : | Substrate Wetting Additive|',
 '<p>Company  Company | : | BYK USA Inc. | 524 South Cherry Street | Wallingford CT 06492 | Prepared by | : | J.Nole, Safety; M.McCutcheon, Regulatory | Telephone | : | (203) 265-2086 | Visit our web site | : | www.byk.com | E-mail address | : | ehs.byk.usa@altana.com|',
 '<p>Emergency telephone | number | : |',
 '<h3>CHEMTREC 800-424-9300 |',
 '',
 '<h2>SECTION 2. HAZARDS IDENTIFICATION |',
 '',
 '<h9>Emergency Overview |',
 '',
 '<p>Form | : liquid | Colour | : light brown | Odour | : not significant||',
 '<h9>OSHA Regulatory Status ||',
 '<p>This material is considered hazardous by the OSHA Hazard Communication Standard (29 |',
 '<h3>CFR1910.1200)

In [12]:
# Inspect the header texts collected for each header tag.
header_dict

{'<h5>': ['Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |',
  'Material Safety Data Sheet |'],
 '<h1>': ['BYK-349 |',
  'BYK-349 |',
  'BYK-349 |',
  'BYK-349 |',
  'BYK-349 |',
  'BYK-349 |',
  'BYK-349 |',
  'BYK-349 |',
  'BYK-349 |'],
 '<h2>': ['SECTION 1. PRODUCT AND COMPANY IDENTIFICATION |',
  'SECTION 2. HAZARDS IDENTIFICATION |',
  'SECTION 3. COMPOSITION/INFORMATION ON INGREDIENTS |',
  'SECTION 4. FIRST AID MEASURES |',
  'SECTION 5. FIRE-FIGHTING MEASURES |',
  'SECTION 6. ACCIDENTAL RELEASE MEASURES |',
  'SECTION 7. HANDLING AND STORAGE |',
  'SECTION 8. EXPOSURE CONTROLS/PERSONAL PROTECTION |',
  'SECTION 9. PHYSICAL AND CHEMICAL PROPERTIES |',
  'SECTION 10. STABILITY AND REACTIVITY |',
  'SECTION 11. TOXICOLOGICAL INFORMATION |',
  'SECTION 12. ECOLOGIC

In [13]:
# A tag can only be the section-heading style if it carries at least 16
# headings, i.e. one per pattern in standard_headers.
candidate_headers = [tag for tag, headings in header_dict.items() if len(headings) >= 16]
candidate_headers

['<h2>', '<h9>']

In [14]:
# Score each candidate tag by the fraction of its headings that match at
# least one of the standard section patterns (matching on the lower-cased,
# stripped heading text).
scores = {}
for ch in candidate_headers:
    headings = header_dict[ch]
    num_matches = sum(
        1
        for heading in headings
        if any(re.search(pattern, heading.lower().strip()) for pattern in standard_headers)
    )
    score = float(num_matches) / len(headings)
    scores[ch] = score
scores

{'<h2>': 0.8421052631578947, '<h9>': 0.030303030303030304}

In [15]:
# The (tag, score) pair with the highest score wins; ties keep dict order.
ranked_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
winner_h = ranked_scores[0]
header_dict[winner_h[0]]

['SECTION 1. PRODUCT AND COMPANY IDENTIFICATION |',
 'SECTION 2. HAZARDS IDENTIFICATION |',
 'SECTION 3. COMPOSITION/INFORMATION ON INGREDIENTS |',
 'SECTION 4. FIRST AID MEASURES |',
 'SECTION 5. FIRE-FIGHTING MEASURES |',
 'SECTION 6. ACCIDENTAL RELEASE MEASURES |',
 'SECTION 7. HANDLING AND STORAGE |',
 'SECTION 8. EXPOSURE CONTROLS/PERSONAL PROTECTION |',
 'SECTION 9. PHYSICAL AND CHEMICAL PROPERTIES |',
 'SECTION 10. STABILITY AND REACTIVITY |',
 'SECTION 11. TOXICOLOGICAL INFORMATION |',
 'SECTION 12. ECOLOGICAL INFORMATION |',
 'SECTION 13. DISPOSAL CONSIDERATIONS |',
 'SECTION 14. TRANSPORT INFORMATION |',
 'DOT |',
 'IATA |',
 'IMDG_US ',
 'SECTION 15. REGULATORY INFORMATION |',
 'SECTION 16. OTHER INFORMATION |']

In [23]:
def filter_headers(header_list, regex_list):
    """Keep only the headers that match at least one of the given patterns.

    Each header is lower-cased and stripped before it is searched, so the
    patterns are expected to be written in lower case. The input is not
    modified, and the relative order of the kept headers is preserved.

    :param header_list: header texts to filter (any iterable of str)
    :type header_list: list
    :param regex_list: regular expressions accepted by ``re.search``
    :type regex_list: list
    :rtype: list
    :return: new list with the headers that matched at least one pattern
    """
    patterns = list(regex_list)  # allow one-shot iterables; reused per header
    # Single pass with short-circuit matching instead of copy-then-remove,
    # which rescanned the list for every rejected header.
    return [
        h
        for h in header_list
        if any(re.search(pattern, h.lower().strip()) for pattern in patterns)
    ]

In [24]:
# Drop the winning tag's headings that match none of the standard patterns.
filter_headers(header_dict[winner_h[0]], standard_headers)

['SECTION 1. PRODUCT AND COMPANY IDENTIFICATION |',
 'SECTION 2. HAZARDS IDENTIFICATION |',
 'SECTION 3. COMPOSITION/INFORMATION ON INGREDIENTS |',
 'SECTION 4. FIRST AID MEASURES |',
 'SECTION 5. FIRE-FIGHTING MEASURES |',
 'SECTION 6. ACCIDENTAL RELEASE MEASURES |',
 'SECTION 7. HANDLING AND STORAGE |',
 'SECTION 8. EXPOSURE CONTROLS/PERSONAL PROTECTION |',
 'SECTION 9. PHYSICAL AND CHEMICAL PROPERTIES |',
 'SECTION 10. STABILITY AND REACTIVITY |',
 'SECTION 11. TOXICOLOGICAL INFORMATION |',
 'SECTION 12. ECOLOGICAL INFORMATION |',
 'SECTION 13. DISPOSAL CONSIDERATIONS |',
 'SECTION 14. TRANSPORT INFORMATION |',
 'SECTION 15. REGULATORY INFORMATION |',
 'SECTION 16. OTHER INFORMATION |']