In [1]:
import pandas as pd
from __future__ import print_function
import fitz
import sys
from operator import itemgetter

In [2]:
#declare the path of your file
file_path = "../data/23114.pdf"  #/pdf_file/data.pdf

In [3]:
doc = fitz.open(file_path)

In [4]:
# def parse_to_html(pdf):
#     """
#     Parses pdf file to html object
#     Use filepath of pdf as argument
#     """
#     doc = fitz.open(pdf)
#     for page in doc:
#         html_content = page.getText("html")
#     return html_content

In [5]:
def get_font_style_counts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = (int(s['text'].isupper()), int('bold' in s['font'].lower()), float(s['size'])) #"{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage
    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [6]:
def get_font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = font_counts[0][0] # get style for most used font by count (paragraph)

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_styles = []
    for ((upper, bold, font_size), count) in font_counts:
        font_styles.append((upper, bold, font_size))
    font_styles.sort(key=itemgetter(0,2,1), reverse=True)
#     return font_styles

    # aggregating the tags for each font size
    idx = 0
    style_tag = {}
    for style in font_styles:
        idx += 1
        if style == p_style:
            idx = 0
            style_tag[style] = '<p>'
            continue
        if style[2] > p_style[2]:
            style_tag[style] = '<h{0}>'.format(idx)
        elif style[2] < p_style[2]:
            style_tag[style] = '<s{0}>'.format(idx)
        else:
            style_tag[style] = '<h{0}>'.format(idx)
    return style_tag

In [7]:
def assign_tags_to_content(doc, style_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param style_tag: textual element tags for each style (uppercase_flag, bold_flag, size)
    :type style_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        i=0
        for b in blocks:  # iterate through the text blocks
#             i+=1
#             if i == 4:
#                 break
#             print(i)
#             pp.pprint(b)

            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                s_key = (int(s['text'].isupper()), int('bold' in s['font'].lower()), float(s['size']))
                                block_string = style_tag[s_key] + s['text']
                            else:
                                s_key = (int(s['text'].isupper()), int('bold' in s['font'].lower()), float(s['size']))
                                previous_key = (int(previous_s['text'].isupper()), int('bold' in previous_s['font'].lower()), float(previous_s['size']))
                                if s_key == previous_key:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = style_tag[s_key] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = style_tag[s_key] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = style_tag[s_key] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)
    return header_para

In [8]:
font_counts, styles = get_font_style_counts(doc, granularity=True)

In [9]:
font_counts

[((0, 1, 8.7524995803833), 421),
 ((0, 0, 8.7524995803833), 412),
 ((0, 1, 9.744400024414062), 32),
 ((1, 0, 8.7524995803833), 29),
 ((0, 1, 7.76039981842041), 21),
 ((1, 1, 9.744400024414062), 16),
 ((0, 1, 3.90939998626709), 10),
 ((1, 1, 13.595399856567383), 10),
 ((0, 1, 13.595399856567383), 10),
 ((0, 1, 0.9919000267982483), 10),
 ((0, 1, 1.9254000186920166), 10),
 ((1, 1, 8.7524995803833), 9),
 ((0, 1, 9.219099998474121), 5),
 ((0, 0, 9.744400024414062), 3)]

In [10]:
style_tag = get_font_tags(font_counts, styles)
style_tag

{(1, 1, 13.595399856567383): '<h1>',
 (1, 1, 9.744400024414062): '<h2>',
 (1, 1, 8.7524995803833): '<h3>',
 (1, 0, 8.7524995803833): '<h4>',
 (0, 1, 13.595399856567383): '<h5>',
 (0, 1, 9.744400024414062): '<h6>',
 (0, 0, 9.744400024414062): '<h7>',
 (0, 1, 9.219099998474121): '<h8>',
 (0, 1, 8.7524995803833): '<p>',
 (0, 0, 8.7524995803833): '<h1>',
 (0, 1, 7.76039981842041): '<s2>',
 (0, 1, 3.90939998626709): '<s3>',
 (0, 1, 1.9254000186920166): '<s4>',
 (0, 1, 0.9919000267982483): '<s5>'}

In [11]:
tagged_text = assign_tags_to_content(doc, style_tag)
tagged_text

['|',
 '<h6>Material Safety Data Sheet ||',
 '|',
 '<h1>LUPEROX® P|',
 '|||',
 '|',
 '<p>Product code: 051000|',
 '|',
 '<p>Version 2.0| Issued on: 07/11/2011| Page: 1 / 10|',
 '|||||',
 '',
 '<h2>1. PRODUCT AND COMPANY IDENTIFICATION||',
 '<p>Company || Arkema Inc.| 900 First Avenue | King of Prussia, Pennsylvania 19406 || Functional Additives|| Customer Service Telephone Number: |',
 '<h1>(800) 331-7654 | (Monday through Friday, 8:30 AM to 5:30 PM EST) ||',
 '<p>Emergency Information || Transportation:|',
 '<h4>CHEMTREC: (800) 424-9300 |',
 '<h1>(24 hrs., 7 days a week) |',
 '<p>Medical: |',
 '<h1>Rocky Mountain Poison Center: (866) 767-5089 | (24 hrs., 7 days a week) ||',
 '<p>Product Information || Product name:|',
 '<h4>LUPEROX® P|',
 '',
 '<p>Synonyms: |',
 '<h1>Peroxyester, t-butyl perbenzoate, tert-butyl peroxybenzoate |',
 '<p>Molecular formula:|',
 '<h4>C11 H14 O3|',
 '<p>Chemical family:|',
 '<h1>Organic peroxide - peroxyesters|',
 '',
 '<p>Product use: |',
 '<h1>Initiator|'

In [12]:
headers = [i for i in tagged_text if i.startswith("<h2")]
headers

['<h2>1. PRODUCT AND COMPANY IDENTIFICATION||',
 '<h2>2. HAZARDS IDENTIFICATION||',
 '<h2>3. COMPOSITION/INFORMATION ON INGREDIENTS ||',
 '<h2>4. FIRST AID MEASURES ||',
 '<h2>5. FIRE-FIGHTING MEASURES ||',
 '<h2>6. ACCIDENTAL RELEASE MEASURES ||',
 '<h2>7. HANDLING AND STORAGE ||',
 '<h2>8. EXPOSURE CONTROLS/PERSONAL PROTECTION ||',
 '<h2>9. PHYSICAL AND CHEMICAL PROPERTIES ||',
 '<h2>10. STABILITY AND REACTIVITY ||',
 '<h2>11. TOXICOLOGICAL INFORMATION ||',
 '<h2>12. ECOLOGICAL INFORMATION ||',
 '<h2>13. DISPOSAL CONSIDERATIONS ||',
 '<h2>14. TRANSPORT INFORMATION ||',
 '<h2>15. REGULATORY INFORMATION ||',
 '<h2>16. OTHER INFORMATION ||']

In [13]:
headers = [i for i in tagged_text if i.startswith("1")] 

In [14]:
headers


[]