In [1]:
import pandas as pd
from __future__ import print_function
import fitz
import sys
from operator import itemgetter

In [2]:
# Path of the PDF file to parse (relative or absolute).
file_path = "1.pdf"  # e.g. /pdf_file/data.pdf

In [3]:
# Open the PDF with PyMuPDF (fitz); `doc` is iterated page by page below.
doc = fitz.open(file_path)

In [4]:
def get_font_style_counts(doc, granularity=False):
    """Count how often each font style is used by the text spans of a PDF.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: if True, identify each span by the tuple
        ``(uppercase_flag, bold_flag, size)``; if False, identify it by its
        font size formatted as a string
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: identifiers sorted by descending usage count, and a dict mapping
        each identifier to the style attributes of the last span seen with it
    :raises ValueError: if no text span was found in the document
    """
    styles = {}
    font_counts = {}

    for page in doc:
        # PyMuPDF renamed Page.getText() to Page.get_text(); prefer the new
        # name and fall back to the legacy one for old installations.
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        identifier = (int(span['text'].isupper()),
                                      int('bold' in span['font'].lower()),
                                      float(span['size']))
                        styles[identifier] = {'size': span['size'], 'flags': span['flags'],
                                              'font': span['font'], 'color': span['color']}
                    else:
                        identifier = "{0}".format(span['size'])
                        styles[identifier] = {'size': span['size'], 'font': span['font']}

                    # count the usage of this style
                    font_counts[identifier] = font_counts.get(identifier, 0) + 1

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [5]:
def get_font_tags(font_counts, styles):
    """Build the mapping from style tuple to element tag.

    The first entry of ``font_counts`` is taken as the paragraph style and
    tagged ``<p>``. The remaining styles are ordered by
    (uppercase_flag, size, bold_flag), high to low, and numbered in that
    order; the numbering restarts after the paragraph style. A style whose
    size is below the paragraph size gets ``<sN>``, every other style ``<hN>``.

    :param font_counts: ((uppercase_flag, bold_flag, size), count) pairs,
        expected to be sorted by descending count
    :type font_counts: list
    :param styles: all styles found in the document (not used here)
    :type styles: dict
    :rtype: dict
    :return: element tag for every (uppercase_flag, bold_flag, size) style
    """
    paragraph_style = font_counts[0][0]  # style of the first (most used) entry

    # Drop the counts and order the styles high to low so that the running
    # position can be appended to each tag.
    ordered_styles = sorted(
        [(upper, bold, font_size) for (upper, bold, font_size), _count in font_counts],
        key=itemgetter(0, 2, 1),
        reverse=True,
    )

    style_tag = {}
    position = 0
    for style in ordered_styles:
        if style == paragraph_style:
            position = 0  # numbering restarts after the paragraph style
            style_tag[style] = '<p>'
            continue
        position += 1
        template = '<s{0}>' if style[2] < paragraph_style[2] else '<h{0}>'
        style_tag[style] = template.format(position)
    return style_tag

In [6]:
def assign_tags_to_content(doc, style_tag):
    """Scrape headers and paragraphs from a PDF and return them with element tags.

    Consecutive non-blank spans that share the same style are concatenated
    into one entry; a style change or the end of a text block closes the
    current entry. Each entry starts with the tag of its style.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param style_tag: textual element tags for each style (uppercase_flag, bold_flag, size)
    :type style_tag: dict
    :rtype: list
    :return: texts with prepended element tags
    :raises KeyError: if a span's style is missing from ``style_tag``
    """
    def style_key(span):
        # (uppercase_flag, bold_flag, size) key used to look up style_tag
        return (int(span['text'].isupper()),
                int('bold' in span['font'].lower()),
                float(span['size']))

    def finished(text):
        # drop the line-break markers and surrounding punctuation
        return text.replace('|','').strip(".:: ·")

    header_para = []  # list with headers and paragraphs
    previous_key = None  # style of the last non-blank span; None until one is seen

    for page in doc:
        # PyMuPDF renamed Page.getText() to Page.get_text(); prefer the new
        # name and fall back to the legacy one for old installations.
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text collected for the current entry
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # ignore whitespace-only spans
                        continue
                    key = style_key(span)
                    if key != previous_key:
                        # Style changed: close the entry collected so far.
                        # Before the very first span there is nothing to close.
                        if previous_key is not None and block_string:
                            header_para.append(finished(block_string))
                        block_string = style_tag[key] + text
                    elif not block_string.strip("|"):
                        # Same style but the block holds no text yet (empty or
                        # only pipes): start a new tagged entry. This is a
                        # single branch so the text is not appended twice.
                        block_string = style_tag[key] + text
                    else:
                        # Same style within the same block: concatenate.
                        block_string += " " + text
                    previous_key = key

                # end of a line, marked with a pipe
                block_string += "|"

            if block_string:
                header_para.append(finished(block_string))
    return header_para

In [7]:
# Count style usage with granular (uppercase_flag, bold_flag, size) identifiers,
# which is the key format get_font_tags unpacks.
font_counts, styles = get_font_style_counts(doc, granularity=True)

In [8]:
# Map every style tuple to its element tag; the bare name on the last line
# makes the notebook display the mapping.
style_tag = get_font_tags(font_counts, styles)
style_tag

{(1, 0, 32.77130126953125): '<h1>',
 (1, 1, 10.020000457763672): '<h2>',
 (1, 0, 10.020000457763672): '<h3>',
 (1, 0, 7.019999980926514): '<s4>',
 (1, 0, 6.0): '<s5>',
 (0, 0, 32.77130126953125): '<h6>',
 (0, 0, 15.0): '<h7>',
 (0, 1, 13.979999542236328): '<h8>',
 (0, 1, 12.0): '<h9>',
 (0, 0, 12.0): '<h10>',
 (0, 1, 10.020000457763672): '<h11>',
 (0, 0, 10.020000457763672): '<p>',
 (0, 0, 9.0): '<s1>',
 (0, 0, 8.991100311279297): '<s2>',
 (0, 0, 7.980000019073486): '<s3>',
 (0, 0, 7.019999980926514): '<s4>',
 (0, 0, 1.0199999809265137): '<s5>'}

In [9]:
# List of text entries, each prefixed with the tag of its style.
tagged_text = assign_tags_to_content(doc, style_tag)


In [10]:
import re
# Keywords per section title, keyed by section number (1-16).
# sectiontitle_regexp joins the keywords of one entry with ".*", so they must
# occur in this order in a title line; the lines they are matched against are
# lowercased first.
# NOTE(review): presumably the 16 standard sections of a safety data sheet —
# confirm against the input documents.
section_titles={
    1:['1','identification'],
    2:['2','hazard','identification'],
    3:['3','composition','ingredients'],
    4:['4','first','aid','measures'],
    5:['5','fire','fight','measures'],
    6:['6','accidental','release','measures'],
    7:['7','handling','storage'],
    8:['8','exposure','controls','personal','protection'],
    9:['9','physical','chemical','properties'],
    10:['10','stability','reactivity'],
    11:['11','information'],
    12:['12','ecological','information'],
    13:['13','disposal','considerations'],
    14:['14','transport','information'],
    15:['15','regulatory','information'],
    16:['16','other','information'],
}

def convert_tagged_text_into_map(text):
    """Group the lowercased lines of ``text`` by the ``<hN>`` tag they contain.

    Lines in which extract_header finds no ``<hN>`` tag are left out.

    :param text: tagged text lines
    :type text: list
    :rtype: dict
    :return: ``<hN>`` tag -> list of lowercased lines carrying that tag
    """
    lines_by_header = {}
    for entry in text:
        tag = extract_header(entry)
        if tag is None:
            continue
        lines_by_header.setdefault(tag, []).append(entry.lower())
    return lines_by_header

def find_section_header(header_dict):
    """Return the header tag whose lines contain all section titles.

    :param header_dict: header tag -> list of lowercased lines with that tag
    :type header_dict: dict
    :return: the first tag for which check_section_titles_present succeeds,
        or None if there is no such tag
    """
    # Iterate the argument itself rather than a module-level variable, so the
    # function works for any mapping passed in.
    for key, lines in header_dict.items():
        if check_section_titles_present(lines):
            return key
    return None

def sectiontitle_regexp(values,headervalue="<hx>"):
    """Turn a list of keywords into a regular expression for a section title.

    The pattern starts with the header tag, followed by every keyword in
    order, each separated by ``.*``.

    :param values: keywords that must occur in this order
    :type values: list
    :param headervalue: specific header tag to require; the default ``<hx>``
        matches any ``<hN>`` tag
    :type headervalue: str
    :rtype: str
    :return: the regular expression source
    """
    # Raw string: "\d" is not a valid escape sequence in a normal literal.
    prefix = r"<h\d+>" if headervalue == "<hx>" else headervalue
    return prefix + ".*" + "".join(keyword + ".*" for keyword in values)

def extract_header(line,only_h=True):
    """Extract the first tag found in ``line``.

    :param line: tagged text line
    :type line: str
    :param only_h: if true, only ``<hN>`` tags are considered; otherwise any
        ``<...>`` tag is accepted
    :type only_h: bool
    :rtype: str or None
    :return: the tag including its angle brackets, or None if none is found
    """
    # "[^>]+" stops at the first ">", so the match is the tag only and does
    # not run on to a later ">" in the text.
    pattern = r"<h\d+>" if only_h else r"<[^>]+>"
    match = re.search(pattern, line)
    return None if match is None else match.group()
    
def check_section_titles_present(values):
    """Check whether all 16 section titles are present in ``values``.

    :param values: candidate title lines
    :type values: list
    :rtype: bool
    :return: True if, for each section 1-16, at least one line matches the
        pattern built from that section's keywords; False otherwise
    """
    for section_no in range(1, 17):
        pattern = re.compile(sectiontitle_regexp(section_titles[section_no]))
        matching_lines = [candidate for candidate in values if pattern.match(candidate)]
        if not matching_lines:
            return False
    return True
    
# Group the tagged lines by header tag, then pick the tag whose lines contain
# all section titles.
header_map=convert_tagged_text_into_map(tagged_text)
section_title_header=find_section_header(header_map)

In [11]:
section_title_header

'<h9>'

In [12]:
def convert_text_into_dict(tagged_text,section_title_header):
    """Split the tagged lines into sections keyed by their title line.

    Lines before the first section title are collected under ``"Header"``.
    Section titles are searched for one at a time in numeric order: only
    after title N has been found is title N+1 looked for.

    :param tagged_text: tagged text lines
    :type tagged_text: list
    :param section_title_header: header tag carried by the section titles
    :type section_title_header: str
    :rtype: dict
    :return: section title line (or ``"Header"``) -> list of following lines
    """
    section_no = 1
    pattern = sectiontitle_regexp(section_titles[section_no], section_title_header)
    active_key = "Header"
    sections = {"Header": []}
    for entry in tagged_text:
        if re.search(pattern, entry.lower()) is None:
            # ordinary line: belongs to the section currently open
            sections[active_key].append(entry)
            continue
        # the line is the next section title: open a new section
        active_key = entry
        sections[active_key] = []
        if section_no < 16:
            section_no += 1
            pattern = sectiontitle_regexp(section_titles[section_no], section_title_header)
    return sections

def extract_headers_from_list(values):
    """Return the distinct tags found in ``values``.

    None entries are skipped. The result may itself contain None when a line
    carries no tag; its order is not defined.

    :param values: tagged text lines
    :type values: list
    :rtype: list
    :return: de-duplicated results of extract_header(line, False)
    """
    distinct_tags = {extract_header(entry, False) for entry in values if entry is not None}
    return list(distinct_tags)


def get_next_smaller_header(base_header,values):
    """Pick a header tag with a higher number than ``base_header``.

    :param base_header: reference tag of the form ``<hN>``
    :type base_header: str
    :param values: tags to choose from; None entries and entries that do not
        start with an ``<hN>`` tag are ignored
    :type values: list
    :rtype: str
    :return: ``<hM>`` where M is the highest header number in ``values``
        greater than N, or ``'<hx>'`` if there is none
    """
    base_level = int(base_header.replace('<h', '').replace('>', ''))

    deeper_levels = []
    for item in values:
        if item is None:
            continue
        # Only genuine "<hN>" tags count; parsing with a regex avoids int()
        # failing on other strings that merely contain the letter "h".
        match = re.match(r"<h(\d+)>", item)
        if match is None:
            continue
        level = int(match.group(1))
        if level > base_level:
            deeper_levels.append(level)

    if not deeper_levels:
        return '<hx>'
    # NOTE(review): this returns the highest number found, not the one closest
    # to base_header — confirm that this is the intended "next smaller" header.
    return '<h' + str(max(deeper_levels)) + '>'
    
def record_span(values,header):
    """Return the positions of the lines that contain ``header``.

    :param values: text lines
    :type values: list
    :param header: tag to look for, e.g. ``<h10>``
    :type header: str
    :rtype: list
    :return: indices into ``values``, in ascending order
    """
    return [position for position, entry in enumerate(values) if header in entry]

def convert_children_into_parents(values,parent_locations):
    """Group the lines of ``values`` under the lines at ``parent_locations``.

    Lines before the first parent are kept as they are. Every parent line
    becomes a one-entry dict whose value is the list of lines up to the next
    parent (or up to the end for the last parent).

    :param values: text lines
    :type values: list
    :param parent_locations: ascending indices of the parent lines; must not
        be empty
    :type parent_locations: list
    :rtype: list
    :return: leading lines followed by one ``{parent: children}`` dict per parent
    """
    # lines in front of the first parent stay flat
    grouped = [values[position] for position in range(parent_locations[0])]

    # every parent except the last one owns the lines up to the next parent
    for start, following in zip(parent_locations, parent_locations[1:]):
        grouped.append({values[start]: values[start + 1:following]})

    # the last parent owns everything that is left
    final_parent = parent_locations[-1]
    grouped.append({values[final_parent]: values[final_parent + 1:]})
    return grouped
        
def scourge(end_dict):
    """Recursively nest the lines below a header line under their sub-headers.

    :param end_dict: one-entry dict ``{header line: list of lines}``
    :type end_dict: dict
    :rtype: dict
    :return: ``end_dict`` unchanged when its key carries no ``<hN>`` tag or no
        subdivision is possible; otherwise a new one-entry dict whose value
        holds the leading lines followed by one nested dict per sub-header
    """
    key = list(end_dict.keys())[0]
    header = extract_header(key, only_h=True)
    if header is None:
        # the key is not a header line, so there is nothing to subdivide
        return end_dict

    parent_corpus = end_dict[key]
    child_header = get_next_smaller_header(header, extract_headers_from_list(parent_corpus))
    if child_header == '<hx>':
        # no possible subdivisions, return original dict
        return end_dict

    next_gen_parents = record_span(parent_corpus, child_header)
    if not next_gen_parents:
        # no line carries the child header: keep the content rather than
        # returning an empty dict
        return end_dict

    # subdivisions possible: group the lines, then recurse into every group
    grouped = convert_children_into_parents(parent_corpus, next_gen_parents)
    return {key: [scourge(item) if isinstance(item, dict) else item for item in grouped]}
    
def clean_dictionary(curr_dict):
    """Return a copy of ``curr_dict`` with the element tags removed.

    Tags (``<hN>``, ``<sN>``, ``<p>``) are removed from keys and from string
    values. Asterisks are also removed from string values, which are then
    stripped; values that end up empty are dropped. Nested dicts are cleaned
    recursively.

    :param curr_dict: mapping of tagged title -> list of tagged lines / dicts
    :type curr_dict: dict
    :rtype: dict
    :return: the cleaned mapping
    """
    # "[hs]" rather than "[h|s]": inside a character class "|" is a literal
    # pipe, not an alternation.
    tag_pattern = r'<[hs]\d+>|<p>'
    clean_dict = {}
    for key, val in curr_dict.items():
        new_val = []
        for v in val:
            if isinstance(v, dict):
                new_val.append(clean_dictionary(v))
                continue
            clean_v = re.sub(tag_pattern + r'|\*', '', v).strip()
            if clean_v:
                new_val.append(clean_v)
        clean_dict[re.sub(tag_pattern, '', key)] = new_val
    return clean_dict

# Split the document into sections, then nest and clean each section separately.
temp_dict=convert_text_into_dict(tagged_text,find_section_header(header_map))
temp_list=[]
for k,v in temp_dict.items():
    # each section is passed as its own one-entry dict
    temp_list.append(clean_dictionary(scourge({k:v})))        

In [15]:
import json 
# Write the list of cleaned section dicts to data.json in the working directory.
with open('data.json', 'w') as f:
    json.dump(temp_list, f)