In [1]:
import pandas as pd
from __future__ import print_function
import fitz
import sys
from operator import itemgetter
import re

In [2]:
#declare the path of your file
file_path = "../data/Savinyl-Orange-RLS.pdf"  #/pdf_file/data.pdf
standard_headers = [
    r"(\d+.*identification)",
    r"(\d+.*hazard)",
    r"(\d+.*composition)",
    r"(\d+.*first.aid)",
    r"(\d+.*fire.fighting)",
    r"(\d+.*accidental release)",
    r"(\d+.*handling)",
    r"(\d+.*exposure)",
    r"(\d+.*physical and chemical)",
    r"(\d+.*stability and reactivity)",
    r"(\d+.*toxicological)",
    r"(\d+.*ecological)",
    r"(\d+.*disposal)",
    r"(\d+.*transport)",
    r"(\d+.*regulatory)",
    r"(\d+.*other information)"
]

In [3]:
doc = fitz.open(file_path)

In [4]:
# def parse_to_html(pdf):
#     """
#     Parses pdf file to html object
#     Use filepath of pdf as argument
#     """
#     doc = fitz.open(pdf)
#     for page in doc:
#         html_content = page.getText("html")
#     return html_content

In [5]:
def get_font_style_counts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = (int(s['text'].isupper()), int('bold' in s['font'].lower()), float(s['size'])) #"{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage
    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [6]:
def get_font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = font_counts[0][0] # get style for most used font by count (paragraph)

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_styles = []
    for ((upper, bold, font_size), count) in font_counts:
        font_styles.append((upper, bold, font_size))
    font_styles.sort(key=itemgetter(0,2,1), reverse=True)
#     return font_styles

    # aggregating the tags for each font size
    idx = 0
    style_tag = {}
    for style in font_styles:
        idx += 1
        if style == p_style:
            idx = 0
            style_tag[style] = '<p>'
            continue
        if style[2] > p_style[2]:
            style_tag[style] = '<h{0}>'.format(idx)
        elif style[2] < p_style[2]:
            style_tag[style] = '<s{0}>'.format(idx)
        else:
            style_tag[style] = '<h{0}>'.format(idx)
    return style_tag

In [7]:
def assign_tags_to_content(doc, style_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param style_tag: textual element tags for each style (uppercase_flag, bold_flag, size)
    :type style_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    header_dict = {}
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        i=0
        for b in blocks:  # iterate through the text blocks
#             i+=1
#             if i == 4:
#                 break
#             print(i)
#             pp.pprint(b)

            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                s_key = (int(s['text'].isupper()), int('bold' in s['font'].lower()), float(s['size']))
                                block_string = style_tag[s_key] + s['text']
                            else:
                                s_key = (int(s['text'].isupper()), int('bold' in s['font'].lower()), float(s['size']))
                                previous_key = (int(previous_s['text'].isupper()), int('bold' in previous_s['font'].lower()), float(previous_s['size']))
                                if s_key == previous_key:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = style_tag[s_key] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = style_tag[s_key] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    if block_string.startswith("<h"):
                                        if style_tag[previous_key] in header_dict: 
                                            header_dict[style_tag[previous_key]].append(block_string[block_string.index(">")+1:])
                                        else:
                                            header_dict[style_tag[previous_key]] = [block_string[block_string.index(">")+1:]]
                                    block_string = style_tag[s_key] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)
                if block_string.startswith("<h"):
                    if style_tag[s_key] in header_dict: 
                        header_dict[style_tag[s_key]].append(block_string[block_string.index(">")+1:])
                    else:
                        header_dict[style_tag[s_key]] = [block_string[block_string.index(">")+1:]]
    return header_para, header_dict

In [8]:
font_counts, styles = get_font_style_counts(doc, granularity=True)

In [9]:
font_counts

[((0, 0, 9.956937789916992), 349),
 ((0, 1, 9.956937789916992), 179),
 ((0, 0, 5.03837776184082), 93),
 ((0, 1, 11.996455192565918), 25),
 ((0, 1, 15.955316543579102), 16),
 ((0, 0, 11.996455192565918), 16),
 ((0, 0, 0.9595194458961487), 15),
 ((1, 0, 9.956937789916992), 13),
 ((0, 0, 21.953685760498047), 8),
 ((0, 1, 5.03837776184082), 6),
 ((0, 0, 2.039278745651245), 5),
 ((1, 1, 9.956937789916992), 5)]

In [10]:
style_tag = get_font_tags(font_counts, styles)
style_tag

{(1, 1, 9.956937789916992): '<h1>',
 (1, 0, 9.956937789916992): '<h2>',
 (0, 0, 21.953685760498047): '<h3>',
 (0, 1, 15.955316543579102): '<h4>',
 (0, 1, 11.996455192565918): '<h5>',
 (0, 0, 11.996455192565918): '<h6>',
 (0, 1, 9.956937789916992): '<h7>',
 (0, 0, 9.956937789916992): '<p>',
 (0, 1, 5.03837776184082): '<s1>',
 (0, 0, 5.03837776184082): '<s2>',
 (0, 0, 2.039278745651245): '<s3>',
 (0, 0, 0.9595194458961487): '<s4>'}

In [11]:
tagged_text, header_dict = assign_tags_to_content(doc, style_tag)
tagged_text

['<h4>Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
 '<h5>Savinyl Orange RLS|',
 '<p>Page  1(8)  |',
 '<p>Substance key: KS7032   Substance key: KS7032  | Revision Date: 25.11.2010  |',
 '<p>Version : 1 - 5 / EU | Date of printing : 21.06.2011 |',
 '||',
 '|',
 '',
 '<h5>SECTION 1: Identification of the substance/mixture and of the | company/undertaking||',
 '<h7>1.1. Product identifier ||',
 '<h7>Trade name ||',
 '<h7>Savinyl Orange RLS|| Material number:',
 '<p> 103327||',
 '|',
 '<h7>1.2. Relevant identified uses of the substance or mixture and uses advised against || Relevant identified uses of the substance or mixture |',
 '<p>Industry sector :| Paints, lacquers and varnishes industry | Type of use :| dye for special industries ||',
 '<h7>1.3. Details of the supplier of the safety data sheet || Identification of the company |',
 '<p>Clariant Production (France)  | Usine de Huningue  | Avenue de Bâle  | 68331 Huningue  | Telephone no. : +33 3 89 89 60 00  ||

In [12]:
header_dict

{'<h4>': ['Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|',
  'Safety Data Sheet in accordance with Regulation (EU) | No.453/2010|'],
 '<h5>': ['Savinyl Orange RLS|',
  'SECTION 1: Identification of the substance/mixture and of the | company/undertaking||',
  'SECTION 2: Hazards identification||',
  'SECTION 3: Composition/information on ingredients||',
  'Savinyl Orange RLS|',
  'SECTION 4: First aid measures||',
  'SECTION 5: Firefighting measures||',
  'Savinyl Orange RLS|',
  'SECTION 6: Accidental release measures||',
  'SECTION 7

In [13]:
def get_candidate_tags(header_dict):
    candidate_tags = list()
    for ht, hl in header_dict.items():
        if(len(hl)>=16):
            candidate_tags.append(ht)
    return candidate_tags

In [14]:
candidate_tags = get_candidate_tags(header_dict)
candidate_tags

['<h5>', '<h7>']

In [15]:
header_dict['<h7>']

['1.1. Product identifier ||',
 'Trade name ||',
 'Savinyl Orange RLS|| Material number:',
 '1.2. Relevant identified uses of the substance or mixture and uses advised against || Relevant identified uses of the substance or mixture |',
 '1.3. Details of the supplier of the safety data sheet || Identification of the company |',
 'Information about the substance/mixture |',
 '1.4. Emergency telephone number ||',
 '2.1. Classification of the substance or mixture || 2.2. Label elements || Labelling according CLP regulation (Regulation (EC) No. 1272/2008, as amended) |',
 '2.3. Other hazards ||',
 '3.1. Substances || Chemical characterization  |',
 '4.1. Description of first aid measures || General information |',
 'After inhalation |',
 'After contact with skin |',
 'After contact with eyes |',
 'After ingestion |',
 '4.2. Most important symptoms and effects, both acute and delayed || Symptoms |',
 'Hazards |',
 '4.3. Indication of any immediate medical attention and special treatment need

In [16]:
def score_candidate_tags(candidate_tags, header_dict, standard_headers):
    scores = {}
    for ct in candidate_tags:
        num_matches = 0
        for h in header_dict[ct]:
    #         print(f"pattern: {pattern}")
            for pattern in standard_headers:
    #             print(f"matching pattern {pattern} and header {h.lower().strip()}")
    #             print(f"match? {bool(re.search(pattern, h.lower().strip()))}")
                if bool(re.search(pattern, h.lower().strip())):
                    num_matches += 1
                    break
        score = num_matches / len(header_dict[ct])
        scores[ct] = score
    return scores

In [17]:
scores = score_candidate_tags(candidate_tags, header_dict, standard_headers)
scores

{'<h5>': 0.625, '<h7>': 0.1346153846153846}

In [18]:
def get_section_header_list(scores, header_dict): #get the first pair of the dictionary which has the winner tag (h#) as key, and all the titles that have that tag as value
    section_header_tag = sorted(scores.items(), key=itemgetter(1), reverse=True)[0][0]  
    return header_dict[section_header_tag]

In [19]:
section_header_list = get_section_header_list(scores, header_dict)
section_header_list

['Savinyl Orange RLS|',
 'SECTION 1: Identification of the substance/mixture and of the | company/undertaking||',
 'SECTION 2: Hazards identification||',
 'SECTION 3: Composition/information on ingredients||',
 'Savinyl Orange RLS|',
 'SECTION 4: First aid measures||',
 'SECTION 5: Firefighting measures||',
 'Savinyl Orange RLS|',
 'SECTION 6: Accidental release measures||',
 'SECTION 7: Handling and storage||',
 'SECTION 8: Exposure controls/personal protection||',
 'Savinyl Orange RLS|',
 'SECTION 9: Physical and chemical properties||',
 'Savinyl Orange RLS|',
 'SECTION 10: Stability and reactivity||',
 'Savinyl Orange RLS|',
 'SECTION 11: Toxicological information||',
 'SECTION 12: Ecological information||',
 'Savinyl Orange RLS|',
 'SECTION 13: Disposal considerations||',
 'SECTION 14: Transport information||',
 'Savinyl Orange RLS|',
 'SECTION 15: Regulatory information||',
 'SECTION 16: Other information||||']

In [50]:
def filter_headers(header_list, standard_headers):
    output_list = header_list.copy()
    for h in header_list:
        match = False
        for pattern in standard_headers:
#             print(f"matching pattern {pattern} and header {h.lower().strip()}")
#             print(f"match? {bool(re.search(pattern, h.lower().strip()))}")
            if bool(re.search(pattern, h.lower().strip())):
                match = True
                break
        if not match:
            output_list.remove(h)
    return output_list                

In [51]:
output_list = filter_headers(section_header_list, standard_headers)
output_list

['SECTION 1: Identification of the substance/mixture and of the | company/undertaking||',
 'SECTION 2: Hazards identification||',
 'SECTION 3: Composition/information on ingredients||',
 'SECTION 4: First aid measures||',
 'SECTION 6: Accidental release measures||',
 'SECTION 7: Handling and storage||',
 'SECTION 8: Exposure controls/personal protection||',
 'SECTION 9: Physical and chemical properties||',
 'SECTION 10: Stability and reactivity||',
 'SECTION 11: Toxicological information||',
 'SECTION 12: Ecological information||',
 'SECTION 13: Disposal considerations||',
 'SECTION 14: Transport information||',
 'SECTION 15: Regulatory information||',
 'SECTION 16: Other information||||']

In [53]:
def get_dict_header_content(): 
    get_section_header_list(scores, header_dict)
    #section_header_tag = sorted(scores.items(), key=itemgetter(1), reverse=True)[0][0]
    
    dict_section = {}

    for i in range(len(output_list)):        
        header_pos_current = tagged_text.index(f'{section_header_tag}{output_list[i]}')

        if i < len(output_list)-1:
            header_pos_next = tagged_text.index(f'{section_header_tag}{output_list[i+1]}')
        else:
            header_pos_next = len(tagged_text)-1

        content = tagged_text[header_pos_current+1:header_pos_next+1]

        dict_section[output_list[i]] = content  
        return dict_section

In [54]:
dict_section

{'SECTION 1: Identification of the substance/mixture and of the | company/undertaking||': ['<h7>1.1. Product identifier ||',
  '<h7>Trade name ||',
  '<h7>Savinyl Orange RLS|| Material number:',
  '<p> 103327||',
  '|',
  '<h7>1.2. Relevant identified uses of the substance or mixture and uses advised against || Relevant identified uses of the substance or mixture |',
  '<p>Industry sector :| Paints, lacquers and varnishes industry | Type of use :| dye for special industries ||',
  '<h7>1.3. Details of the supplier of the safety data sheet || Identification of the company |',
  '<p>Clariant Production (France)  | Usine de Huningue  | Avenue de Bâle  | 68331 Huningue  | Telephone no. : +33 3 89 89 60 00  ||',
  '<h7>Information about the substance/mixture |',
  '<p>Division Pigments & Additives  | tel.: +33.3.89.89.63.38  | e-mail: France.ProductSafety@clariant.com ||',
  '<h7>1.4. Emergency telephone number ||',
  '<p>+33 1 45 42 59 59   (24 h)|||',
  '<h5>SECTION 2: Hazards identificat

In [40]:
section_header_tag = sorted(scores.items(), key=itemgetter(1), reverse=True)[0][0]

header_pos = tagged_text.index(f'{section_header_tag}SECTION 1: Identification of the substance/mixture and of the | company/undertaking||')
header_pos

8

In [26]:
tagged_text.index(f'{section_header_tag}SECTION 2: Hazards identification||')


22