# 0) Enter Variables

In [None]:
# --- User configuration ---------------------------------------------------
# Only the three settings below (base_path, filename, pg) need to be changed
# between runs.

base_path = "C:/Users/matt/Jesser Analytics Consulting/Natsco Transit Solutions/python/"
filename = 'NOVA_L998'
pg = dict(pgstart=39, pgend=1431)
# Page ranges previously used for each file:
#   NOVA_L998 ### 39 - 1431
#   NF_SR2089 ### 91 - 1451
#   NF_SR1600 ### 79 - 1409

# Layout-analysis parameters forwarded to pdf_analyze
# (its defaults are cm=2.0, lm=0.5, wm=0.1, bf=0.5, lo=0.5).
prm = dict(cm=200.0, lm=0.1, wm=0.1, bf=1.0, lo=0.9)

# Positions/settings forwarded to create_data
# (header, subheader, body start and number of splits on spaces).
out = dict(header=0, subheader=1, bodystart=3, spaces_desc=3)

####################################################

# Derived values: the PDF file name and the output folder for this file.
f = filename + ".pdf"
path = base_path + filename

# Build the list of pages to process and report it.
p = pgprep(pg['pgstart'], pg['pgend'])
print(p['message'])

# 1) Run the Program

In [None]:
# Column layouts for the three result tables: detailed logs, per-page
# summary logs, and the final parsed dataset.
l_col = ['file', 'page', 'charlen', 'char', 'pos_BLx', 'pos_BLy', 'pos_TRx', 'pos_TRy']
s_col = ['filename', 'page', 'loglen', 'top', 'bottom']
d_col = ['file', 'page', 'header', 'subheader', 'localnum', 'revnum', 'pagepos',
         'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col_notes']


def _combine(frames, columns):
    """Stack per-page frames into one table with the requested column order."""
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame(columns=columns)
    # reindex puts the columns back in the intended order after sort=True
    return pd.concat(non_empty, sort=True).reindex(columns=columns)


# Collect one frame per page and concatenate once after the loop.
# DataFrame.append no longer exists in pandas 2.x, and appending inside the
# loop re-copied the accumulated table on every page.
log_frames, summary_frames, data_frames = [], [], []

# run the whole program for the defined set of pages
# The loop variable is not called `p` so the page dict returned by pgprep
# survives and this cell can be re-run.
for page in p['p']:
    # Analyze the requested PDF file (pdf_analyze indexes pages from 0)
    pa = pdf_analyze(filename=f, page=page - 1, cm=prm['cm'], lm=prm['lm'],
                     wm=prm['wm'], bf=prm['bf'], lo=prm['lo'])
    print(pa['message'])

    # Extract data for the specified page
    d = extract_data(filename=f, page=page, layout=pa['layout'])
    print(d['message'])

    # Create the Summary & Detailed logs for the specified page
    log = create_logs(data=d['data'], filename=f, page=page, path=path)
    log_frames.append(log['log'])
    summary_frames.append(log['summary'])
    print(log['message'])

    # Format the Data for the specified page
    data = create_data(data=d['data'], filename=f, path=path, page=page,
                       header=out['header'], subheader=out['subheader'],
                       body=out['bodystart'], spaces_desc=out['spaces_desc'])
    data_frames.append(data['data'])
    print(data['message'])

    print()

# Build the three final dataframes with their columns in the right order
dfs = _combine(summary_frames, s_col)
dfl = _combine(log_frames, l_col)
df = _combine(data_frames, d_col)

print('Done!')

# 2) View Stuff

In [None]:


#dfs #view summary
#dfl
#dfl['char'][7] #view detailed logs
#df #view data

# 3) Write Stuff

In [None]:
#osp = osprep(base_path,filename)
#print(osp['message'])

# create the output folder (and any missing parents) if it doesn't exist already
if os.path.isdir(path):
    print('Using existing folder')
else:
    os.makedirs(path)
    print(f'Created a new folder: "/{filename}"')

data_out_name = os.path.join(path, filename)  # create the output path & filename

# (dataframe, file suffix, label printed after writing)
outputs = (
    (dfs, 'summary', 'summary'),
    (dfl, 'log', 'logs'),
    (df, 'data', 'data'),
)

for frame, suffix, label in outputs:
    target = f'{data_out_name} - {suffix}.csv'
    # Files are written in append mode so earlier output is kept. Only emit
    # the header row when the file is new or empty; otherwise every re-run
    # would insert a second header line in the middle of the CSV.
    write_header = not os.path.isfile(target) or os.path.getsize(target) == 0
    frame.to_csv(target, index=False, mode='a', header=write_header)
    print(f'write {label}')

# 4) Dependencies & Functions we'll use

In [None]:
##################################################
# Import our libraries
##################################################

# we primarily use 'pdfminer' to parse the pdf files 
# we use 'pandas' to convert the results into a useful table structure and export

import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import pandas as pd
import re

##################################################
# Create OS Functions
##################################################

def pgprep(pgstart, pgend):
    """Build the list of page numbers to process.

    Args:
        pgstart: first page number (inclusive).
        pgend: last page number (inclusive).

    Returns:
        dict with 'p' (the list of page numbers) and 'message' (a short
        human-readable summary of the range).
    """
    try:
        # range() already yields a single page when pgstart == pgend
        p = list(range(pgstart, pgend + 1))
    except TypeError:
        # non-integer bounds (e.g. pgend is None): fall back to the start page only
        p = [pgstart]

    return {'message': f'Using {len(p)} pages from {pgstart} to {pgend}', 'p': p}

##################################################
# Create PDF Parse Functions
##################################################
    
def pdf_analyze(filename, page, lm=0.5, cm=2.0, wm=0.1, bf=0.5, lo=0.5):
    """Run layout analysis on one page of a PDF and return its layout object.

    The layout parameters drive how characters are grouped into lines and
    boxes; they are described in the '99) Documentation' cell of this file.

    Args:
        filename: path of the PDF file to open.
        page: zero-based index of the page to analyze.
        lm: passed to LAParams as line_margin.
        cm: passed to LAParams as char_margin.
        wm: passed to LAParams as word_margin.
        bf: passed to LAParams as boxes_flow.
        lo: passed to LAParams as line_overlap.

    Returns:
        dict with 'layout' (the result of the page aggregator for that page)
        and 'message' (a status string using the 1-based page number).

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
        IndexError: if `page` is outside the document's page list.
    """
    # Open the pdf in binary mode; the context manager guarantees the handle
    # is closed even when parsing fails (this function runs once per page).
    with open(filename, "rb") as fp:
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)

        # Store the parsed content in PDFDocument object
        document = PDFDocument(parser)

        # Check if document is extractable, if not abort
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()

        # set parameters for analysis... these come from params in the function
        laparams = LAParams(
            line_margin=lm,
            char_margin=cm,
            word_margin=wm,
            boxes_flow=bf,
            line_overlap=lo,
        )

        # The page aggregator is the device that collects LT layout objects;
        # it shares the resource manager with the interpreter.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Interpreter processes page content and feeds the device
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # put all pages into a list so the requested one can be indexed
        allpages = list(PDFPage.create_pages(document))

        # Process the requested page while the file is still open
        interpreter.process_page(allpages[page])

        # The device renders the layout from interpreter
        layout = device.get_result()

    return {'layout': layout, 'message': f'Page {page+1}: Analyzed PDF'}



def extract_data(filename, page, layout):
    """Collect the text objects of a page layout as plain dictionaries.

    Only LTTextBox / LTTextLine objects are kept; every other object in the
    layout is skipped.

    Args:
        filename: file name recorded on every row.
        page: page number recorded on every row and used in the message.
        layout: iterable of layout objects produced by the parser.

    Returns:
        dict with 'data' (one dict per text object holding its text split
        on newlines, the text length and the four bbox values) and
        'message' (a status string).
    """
    records = []
    for element in layout:
        if not isinstance(element, (LTTextBox, LTTextLine)):
            continue

        text = element.get_text()
        box = element.bbox
        records.append({
            'char': text.split('\n'),
            'file': filename,
            'page': page,
            'charlen': len(text),
            'pos_BLx': box[0],
            'pos_BLy': box[1],
            'pos_TRx': box[2],
            'pos_TRy': box[3],
        })

    return {'data': records, 'message': f'Page {page}: Extracted Objects'}

##################################################
# Create Logs
##################################################

def create_logs(data, filename, path, page):
    """Build the detailed log and the one-row summary for a single page.

    The detailed log has one row per container the parser created, which
    helps troubleshoot outputs that aren't correct. The summary gives a
    high-level overview of the page so it is easy to see which pages parse
    cleanly and which ones need more investigation. Nothing is written to
    disk here; both results are returned as DataFrames.

    Args:
        data: list of dicts as returned in extract_data(...)['data'].
        filename: file name recorded in the summary row.
        path: not used by this function; kept so existing callers still work.
        page: page number recorded in the summary row and in the message.

    Returns:
        dict with 'message', 'summary' (one-row DataFrame with filename,
        page, loglen, top, bottom) and 'log' (DataFrame built from `data`).
        If the page cannot be summarised, 'log' is an empty DataFrame and
        the summary row carries '!ERROR' markers instead of raising.
    """
    try:
        # create a dataframe from the data passed to the function
        log = pd.DataFrame(data)

        # summary: number of objects plus the first text line of the first
        # and of the last object on the page
        logsum = pd.DataFrame(
            {
                'filename': filename,
                'page': page,
                'loglen': len(log),
                'top': log['char'][0][0],
                'bottom': log['char'][len(log)-1][0],
            },
            index=[0],
        )
        message = f'Page {page}: Create Logs'

    except Exception:
        # Deliberate best-effort: a page that cannot be summarised (for
        # example one with no text objects) is recorded as an error row and
        # skipped. Exception, unlike a bare except, lets Ctrl-C stop the run.
        log = pd.DataFrame(columns=['char', 'charlen', 'file', 'page',
                                    'pos_BLx', 'pos_BLy', 'pos_TRx', 'pos_TRy'])
        logsum = pd.DataFrame(
            {
                'filename': filename,
                'page': page,
                'loglen': 0,
                'top': '!ERROR',
                'bottom': '!ERROR',
            },
            index=[0],
        )
        message = f'Page {page}: ERROR! No Logs'

    return {'message': message, 'summary': logsum, 'log': log}


##################################################
# Create Final Data
##################################################
    
def create_data(data, filename, path, page, header=0, subheader=1, body=5, spaces_desc=2):
    """Turn the extracted objects of one page into rows of the final table.

    Objects from index `body` up to (but excluding) the last two are treated
    as table rows. The first text line of each is split on spaces into
    columns; page-level metadata is attached to every row.

    Args:
        data: list of dicts as returned in extract_data(...)['data']; only
            the first entry of each dict's 'char' list is used.
        filename: file name recorded on every row.
        path: not used by this function; kept so existing callers still work.
        page: page number recorded on every row and in the message.
        header: index in `data` of the object used as 'header'.
        subheader: index in `data` of the object used as 'subheader'.
        body: index in `data` of the first table row.
        spaces_desc: maximum number of splits applied to each row's text.

    Returns:
        dict with 'message' and 'data' (a DataFrame with one row per body
        object; empty when the page has no body rows).
    """
    # Compile the patterns once instead of on every row. Raw strings keep
    # the `\s` escape valid; the patterns themselves are unchanged.
    invalid_re = re.compile(r'.[#*()&%$@!+-]')
    flag_re = re.compile(r'^([A-Z]|[#*()^&%$@!+-])\s')
    qty_re = re.compile(r'^[0-9]+')
    qr_re = re.compile(r'^QR')

    # Use the `data` argument itself; reading a module-level variable here
    # only worked when the caller happened to store its result under that name.
    totrows = len(data)  # number of objects on the page
    rows_list = []       # one dict per output row

    for r in range(body, totrows - 2):
        ###
        ### First split the 'char' field into its useful parts
        ###
        char = data[r]['char'][0]
        try:
            sp1 = char.split(' ', spaces_desc)

            if invalid_re.match(sp1[0]):
                # not a valid data row: keep the raw text as a note
                bo = {'col_notes': char}
            else:
                # a leading single letter/symbol followed by whitespace in the
                # third field is peeled off the last field as a 2-char flag
                ast = ''
                if flag_re.match(sp1[2]):
                    ast, bd = sp1[-1][:2], sp1[-1][2:]
                    sp1.pop(2)
                    sp1.append(bd)

                # when the third field starts with digits or 'QR', split the
                # last field into its first two characters (qty) and the rest (desc)
                if qty_re.match(sp1[2]) or qr_re.match(sp1[2]):
                    qty, desc = sp1[-1][:2], sp1[-1][2:]
                    sp1.pop(2)  # drop the combined field
                    sp1.extend([desc, qty, ast])
                else:
                    sp1.append(ast)

                # number the pieces col1, col2, ...
                bo = {f'col{i}': c for i, c in enumerate(sp1, start=1)}
        except (IndexError, AttributeError, TypeError):
            # rows with fewer than three fields (or non-text content) carry
            # only the page metadata
            bo = {}

        ###
        ### Then attach the page-level metadata
        ###
        mydict = {
            'file': filename,
            'page': page,
            'header': data[header]['char'][0],
            'subheader': data[subheader]['char'][0],
            'localnum': data[totrows-2]['char'][0],  # second-to-last object on the page
            'revnum': data[totrows-1]['char'][0],    # last object on the page
            'pagepos': r - body + 1,                 # 1-based position within the body
        }
        mydict.update(bo)  # combine the metadata with the broken out table data
        rows_list.append(mydict)

    ###
    ### Finalize the data
    ###
    df = pd.DataFrame(rows_list)

    return {'message': f'Page {page}: Created Data', 'data': df}



### 99) Documentation

In [None]:
#https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67

# pdfTextMiner.py
# Python 2.7.6
# For Python 3.x use pdfminer3k module
# This link has useful information on components of the program
# https://euske.github.io/pdfminer/programming.html
# http://denis.papathanasiou.org/posts/2010.08.04.post.html


''' Important classes to remember
PDFParser - fetches data from pdf file
PDFDocument - stores data parsed by PDFParser
PDFPageInterpreter - processes page contents from PDFDocument
PDFDevice - translates processed information from PDFPageInterpreter to whatever you need
PDFResourceManager - Stores shared resources such as fonts or images used by both PDFPageInterpreter and PDFDevice
LAParams - A layout analyzer returns a LTPage object for each page in the PDF document
PDFPageAggregator - Extract the device to page aggregator to get LT object elements
'''

'''
 Parameters for layout analysis
    
    :param line_overlap: If two characters have more overlap than this they
        are considered to be on the same line. The overlap is specified
        relative to the minimum height of both characters.
    
    :param char_margin: If two characters are closer together than this
        margin they are considered to be part of the same word. If
        characters are on the same line but not part of the same word, an
        intermediate space is inserted. The margin is specified relative to
        the width of the character.
    
    :param word_margin: If two words are closer together than this
        margin they are considered to be part of the same line. A space is
        added in between for readability. The margin is specified relative
        to the width of the word.
    
    :param line_margin: If two lines are close together they are
        considered to be part of the same paragraph. The margin is
        specified relative to the height of a line.
    
    :param boxes_flow: Specifies how much a horizontal and vertical position
        of a text matters when determining the order of text boxes. The value
        should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters).
    
    :param detect_vertical: If vertical text should be considered during
        layout analysis
    
    :param all_texts: If layout analysis should be performed on text in
        figures.
'''


#### useful sample file/page combinations

In [None]:
# example
# NF_SR2089 part 426010 on page 661

##########
#p = 570 #image only
#p = 40 #fancy image
#p = 1377 #contains comments
#p = 1435 #index table at endb
#p = 3 #table of contents
##########

#### useful documentation on parsing the 'char' data

In [None]:
'''
#def create_data(data, filename, path, page, header=0, subheader=1, body=5, write='false'):
  # Here we go! Let's create the final table and put it in Excel

p=857 #885
row = 15

#p=pg['pgstart']
# Analyze the requested PDF file
pa = pdf_analyze(filename=osp['f'], page=p-1, cm=prm['cm'], lm=prm['lm'], wm=prm['wm'], bf=prm['bf'], lo=prm['lo'])
                                    #defaults: cm=2.0,   lm=0.5, wm=0.1, bf=0.5, lo=0.5
print(pa['message'])

# Extract data for the specified page
d = extract_data(filename=osp['f'], page=p,layout=pa['layout'])
print(d['message'])

    
data= d['data']

#get the char piece from the dataset & write to the window
char = data[row]['char'][0]
#print(char)

#split the first two based on space
sp1 = char.split(' ',2)

#look for the asterisk
test_ast = (re.match('^[#*()&%$@!+-]\s', sp1[2])) #or (re.match('^QR', sp1[2]))
ast, body ='',''
if test_ast:
    ast, body = sp1[len(sp1)-1][:2], sp1[len(sp1)-1][2:]
    sp1.pop(2)
    sp1.extend([body])
    print(sp1)

    
#test for matches
test_desqt = (re.match('^[0-9]+', sp1[2])) or (re.match('^QR', sp1[2]))
if test_desqt:
    
    #split the remaining desc/quantity field
    qty, desc = sp1[len(sp1)-1][:2], sp1[len(sp1)-1][2:]

    #get rid of the crappy desc/quantity combo field
    sp1.pop(2)

    #combine the new desc/qty fields back to the main output
    sp1.extend([desc,qty, ast])
    #sp1
else:
    sp1.extend([ast])

sp1

'''