In [1]:
import pandas as pd
from __future__ import print_function
import fitz
import sys
from operator import itemgetter

In [2]:
# Relative path of the PDF to analyse; point this at your own file (e.g. /pdf_file/data.pdf).
file_path = "../data/10N_Sodium_Hydroxide_NaOH_40_6_US_EN_sds.pdf"

In [3]:
# Open the PDF with PyMuPDF (imported as fitz); later cells iterate over its pages.
doc = fitz.open(file_path)

In [4]:
# def parse_to_html(pdf):
#     """
#     Parses pdf file to html object
#     Use filepath of pdf as argument
#     """
#     doc = fitz.open(pdf)
#     for page in doc:
#         html_content = page.getText("html")
#     return html_content

In [5]:
def get_font_style_counts(doc, granularity=False):
    """Count how often each text style occurs in a PDF document.

    Every span of every text block is counted, including whitespace-only spans.

    :param doc: PDF document to iterate through (any iterable of pages works)
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: if True, key styles by ``(uppercase_flag, bold_flag, size)``;
        if False, key them by the font size formatted as a string
    :type granularity: bool
    :rtype: [(style_key, count), (style_key, count)], dict
    :return: style keys sorted by usage count (most used first), and a dict mapping
        each style key to the attributes of the last span seen with that key
    :raises ValueError: if the document contains no text spans
    """
    styles = {}
    font_counts = {}

    for page in doc:
        # PyMuPDF renamed Page.getText to Page.get_text; use whichever this version provides.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the page blocks
            if b['type'] != 0:  # only type 0 blocks contain text
                continue
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if granularity:
                        # (all-uppercase text?, "bold" in font name?, font size)
                        identifier = (int(s['text'].isupper()),
                                      int('bold' in s['font'].lower()),
                                      float(s['size']))
                        styles[identifier] = {'size': s['size'], 'flags': s['flags'],
                                              'font': s['font'], 'color': s['color']}
                    else:
                        identifier = "{0}".format(s['size'])
                        styles[identifier] = {'size': s['size'], 'font': s['font']}

                    font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the style usage

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    # Most used style first; the sort is stable, so ties keep first-seen order.
    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    return font_counts, styles

In [6]:
def get_font_tags(font_counts, styles):
    """Assign an element tag to every style key found in the document.

    The most frequently used style is tagged ``<p>``. All other styles are visited
    in descending order of (uppercase flag, size, bold flag) and numbered with a
    running counter that restarts after the paragraph style: sizes below the
    paragraph size get ``<sN>``, everything else gets ``<hN>``. Because headers
    and sub-texts share that counter, and it restarts at ``<p>``, tag numbers are
    not consecutive per kind and the same tag can be issued to two styles.

    :param font_counts: ((uppercase_flag, bold_flag, font_size), count) pairs, most used first
    :type font_counts: list
    :param styles: all styles found in the document (not used by this function)
    :type styles: dict
    :rtype: dict
    :return: element tag for each (uppercase_flag, bold_flag, font_size) key
    """
    # The first entry is the most used style, which is treated as body text.
    paragraph_key = font_counts[0][0]

    # Drop the counts and order the keys: uppercase first, then larger sizes, then bold.
    ordered_keys = sorted(
        ((upper, bold, font_size) for ((upper, bold, font_size), count) in font_counts),
        key=itemgetter(0, 2, 1),
        reverse=True,
    )

    tags = {}
    rank = 0
    for key in ordered_keys:
        if key == paragraph_key:
            rank = 0  # numbering starts over for the styles after the paragraph
            tags[key] = '<p>'
        else:
            rank += 1
            # strictly smaller than body text -> sub-text, otherwise header
            prefix = 's' if key[2] < paragraph_key[2] else 'h'
            tags[key] = '<{0}{1}>'.format(prefix, rank)
    return tags

In [7]:
def assign_tags_to_content(doc, style_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Consecutive non-blank spans that share a style are joined with spaces into one
    string that starts with the style's tag. A ``|`` is appended at the end of every
    line. The running string is emitted whenever the style changes and at the end of
    every text block, so the result can contain empty or pipe-only entries.

    :param doc: PDF document to iterate through (any iterable of pages works)
    :type doc: <class 'fitz.fitz.Document'>
    :param style_tag: textual element tags for each style (uppercase_flag, bold_flag, size)
    :type style_tag: dict
    :rtype: list
    :return: texts with pre-pended element tags
    :raises KeyError: if a span's style key is missing from ``style_tag``
    """

    def _style_key(span):
        # (all-uppercase text?, "bold" in font name?, font size)
        return (int(span['text'].isupper()),
                int('bold' in span['font'].lower()),
                float(span['size']))

    header_para = []  # list with headers and paragraphs
    previous_key = None  # style key of the previous non-blank span (None before the first one)

    for page in doc:
        # PyMuPDF renamed Page.getText to Page.get_text; use whichever this version provides.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the page blocks
            if b['type'] != 0:  # only type 0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block

            block_string = ""  # text found in block
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # skip whitespace-only spans
                        continue

                    s_key = _style_key(s)
                    # True for "" as well as for strings made up of line pipes only
                    nothing_collected = all(c == "|" for c in block_string)

                    if previous_key is None:
                        # very first span of the document
                        block_string = style_tag[s_key] + s['text']
                    elif s_key != previous_key:
                        # style changed: emit what was collected and start over
                        header_para.append(block_string)
                        block_string = style_tag[s_key] + s['text']
                    elif nothing_collected:
                        # same style but no text collected in this block yet: start with
                        # the tag. This must be exclusive with the branch below, otherwise
                        # the span's text is added twice.
                        block_string = style_tag[s_key] + s['text']
                    else:
                        # same style inside the same block, so concatenate strings
                        block_string += " " + s['text']

                    previous_key = s_key

                # end of a line, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)
    return header_para

In [8]:
# Count span styles across the document, keyed by (uppercase flag, bold flag, size).
font_counts, styles = get_font_style_counts(doc, granularity=True)

In [9]:
# (style key, span count) pairs, most used style first.
font_counts

[((0, 0, 10.020000457763672), 497),
 ((0, 1, 10.020000457763672), 206),
 ((0, 0, 7.980000019073486), 33),
 ((0, 0, 32.77130126953125), 30),
 ((1, 0, 10.020000457763672), 30),
 ((0, 1, 12.0), 25),
 ((1, 1, 10.020000457763672), 15),
 ((0, 1, 13.979999542236328), 11),
 ((0, 0, 9.0), 11),
 ((0, 0, 1.0199999809265137), 11),
 ((1, 0, 6.0), 11),
 ((0, 0, 12.0), 9),
 ((0, 0, 15.0), 3),
 ((1, 0, 7.019999980926514), 3),
 ((0, 0, 7.019999980926514), 3),
 ((0, 0, 8.991100311279297), 3),
 ((1, 0, 32.77130126953125), 2)]

In [10]:
# Map every style key to an element tag; the most used style becomes '<p>'.
style_tag = get_font_tags(font_counts, styles)
style_tag

{(1, 0, 32.77130126953125): '<h1>',
 (1, 1, 10.020000457763672): '<h2>',
 (1, 0, 10.020000457763672): '<h3>',
 (1, 0, 7.019999980926514): '<s4>',
 (1, 0, 6.0): '<s5>',
 (0, 0, 32.77130126953125): '<h6>',
 (0, 0, 15.0): '<h7>',
 (0, 1, 13.979999542236328): '<h8>',
 (0, 1, 12.0): '<h9>',
 (0, 0, 12.0): '<h10>',
 (0, 1, 10.020000457763672): '<h11>',
 (0, 0, 10.020000457763672): '<p>',
 (0, 0, 9.0): '<s1>',
 (0, 0, 8.991100311279297): '<s2>',
 (0, 0, 7.980000019073486): '<s3>',
 (0, 0, 7.019999980926514): '<s4>',
 (0, 0, 1.0199999809265137): '<s5>'}

In [11]:
# Walk the document again and prefix each run of same-style text with its tag.
tagged_text = assign_tags_to_content(doc, style_tag)
tagged_text

['|',
 '<p>Page 1/11|',
 '<h8>Safety Data Sheet (SDS)|',
 '<s1>OSHA HazCom Standard 29 CFR 1910.1200(g) and GHS Rev 03.|',
 '',
 '<p>Issue date 02/09/2017| Reviewed on 02/09/2017|',
 '',
 '<s5>44.2.1|',
 '',
 '<p>* |',
 '<h9>1 Identification|',
 '',
 '<p>· ',
 '<h11>Product Identifier|',
 '',
 '<p>· ',
 '<h11>Trade name: 10N Sodium Hydroxide (NaOH 40%)|',
 '<p>· ',
 '<h11>Product Number:',
 '<p> NGT-10N NaOH| · ',
 '<h11>Relevant identified uses of the substance or mixture and uses advised against:|',
 '<p>No further relevant information available.| · ',
 '<h11>Product Description',
 '<p> PC21   Laboratory chemicals| · ',
 '<h11>Application of the substance / the mixture:',
 '<p> Laboratory chemicals|',
 '<p>· ',
 '<h11>Details of the Supplier of the Safety Data Sheet:|',
 '<p>· ',
 '<h11>Manufacturer/Supplier:|',
 '<p>NuGeneration Technologies, LLC (dba NuGenTec)| 1155 Park Avenue, Emeryville, CA 94608| salesteam@nugentec.com| www.nugentec.com| 1-888-996-8436 or 1-707-820-4080 for pro

In [15]:
# Section headings carry the '<h9>' tag in this document. The number depends on the
# styles found, so re-check style_tag before reusing this filter on another PDF.
# Match the complete tag so that '<h90>'-style tags can never slip through.
headers = [text for text in tagged_text if text.startswith("<h9>")]
headers

['<h9>1 Identification|',
 '<h9>2 Hazard(s) Identification|',
 '<h9>3 Composition/Information on Ingredients|',
 '<h9>4 First-Aid Measures|',
 '<h9>5 Fire-Fighting Measures|',
 '<h9>6 Accidental Release Measures|',
 '<h9>7 Handling and Storage|',
 '<h9>8 Exposure Controls/Personal Protection|',
 '<h9>9 Physical and Chemical Properties|',
 '<h9>10 Stability and Reactivity|',
 '<h9>11 Toxicological Information|',
 '<h9>12 Ecological Information|',
 '<h9>13 Disposal Considerations|',
 '<h9>14 Transport Information|',
 '<h9>15 Regulatory Information|',
 '<h9>16 Other Information|']

In [13]:
# header_para only exists inside assign_tags_to_content; its return value is bound to tagged_text.
tagged_text[:5]

['|',
 '<p>Page 1/11|',
 '<h8>Safety Data Sheet (SDS)|',
 '<s1>OSHA HazCom Standard 29 CFR 1910.1200(g) and GHS Rev 03.|',
 '']