# Extracting text from PDF files

In [1]:
# To read the PDF
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR to extract text from images 
import pytesseract 
# To remove the additional created files
import os

In [2]:
# Create function to extract text

def text_extraction(element):
    """Return the text of a layout element together with the formats it uses.

    Parameters
    ----------
    element : iterable layout element exposing ``get_text()``
        Iterating it yields its children; only children that are
        ``LTTextContainer`` instances are inspected, and within those only
        ``LTChar`` items contribute.

    Returns
    -------
    tuple
        ``(text, formats)`` where ``text`` is ``element.get_text()`` and
        ``formats`` is a de-duplicated list mixing font names and font sizes
        in a single list. The list comes from a ``set``, so its order is
        not meaningful.
    """
    extracted_text = element.get_text()

    # Flatten the element into the individual characters of its text lines
    glyphs = [
        glyph
        for child in element
        if isinstance(child, LTTextContainer)
        for glyph in child
        if isinstance(glyph, LTChar)
    ]

    # Record the font name followed by the font size of every character
    seen_attributes = []
    for glyph in glyphs:
        seen_attributes.extend((glyph.fontname, glyph.size))

    # Keep each font name / font size only once
    unique_formats = list(set(seen_attributes))

    return (extracted_text, unique_formats)

In [3]:
# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    """Extract one table from one page of a PDF with pdfplumber.

    Parameters
    ----------
    pdf_path : path of the PDF file to open.
    page_num : index into ``pdf.pages`` of the page to examine.
    table_num : index into the list returned by ``extract_tables()``.

    Returns
    -------
    The selected entry of ``page.extract_tables()``.

    Raises
    ------
    IndexError
        If ``page_num`` or ``table_num`` is out of range.
    """
    # The context manager closes the document even if an index is out of
    # range; previously the handle was left open on every call.
    with pdfplumber.open(pdf_path) as pdf:
        # Find the examined page
        table_page = pdf.pages[page_num]
        # Extract the appropriate table while the document is still open
        table = table_page.extract_tables()[table_num]

    return table

# Convert table into appropriate format
def table_converter(table):
    """Render a table (sequence of rows of cells) as pipe-delimited text.

    Each row becomes ``|cell|cell|...|``; rows are separated by a newline with
    no trailing newline. ``None`` cells are written as the string ``'None'``
    and line breaks inside a cell are replaced by a single space.
    An empty table yields an empty string.
    """
    rendered_rows = []
    for row in table:
        cells = []
        for item in row:
            if item is None:
                # Missing cells are spelled out literally
                cells.append('None')
            elif '\n' in item:
                # Undo the line wrapping inside a cell
                cells.append(item.replace('\n', ' '))
            else:
                cells.append(item)
        rendered_rows.append('|' + '|'.join(cells) + '|')
    # Joining with newlines leaves no trailing line break to strip
    return '\n'.join(rendered_rows)

In [4]:
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj):
    """Crop a page to an element's bounding box and save it as 'cropped_image.pdf'.

    Parameters
    ----------
    element : layout element providing ``x0``, ``y0``, ``x1`` and ``y1``.
    pageObj : PyPDF2 page object; its ``mediabox`` is modified in place.

    Side effects
    ------------
    Overwrites ``pageObj.mediabox`` and writes 'cropped_image.pdf' in the
    current working directory, replacing any existing file of that name.
    """
    x_min, x_max = element.x0, element.x1
    y_start, y_end = element.y0, element.y1

    # The box is defined by two opposite corners: (x0, y1) and (x1, y0).
    # NOTE(review): the corner passed as `lower_left` uses y1 and the one
    # passed as `upper_right` uses y0 - presumably the resulting rectangle is
    # normalised by the PDF renderer; confirm.
    pageObj.mediabox.lower_left = (x_min, y_end)
    pageObj.mediabox.upper_right = (x_max, y_start)

    # Put the cropped page into a fresh single-page document
    writer = PyPDF2.PdfWriter()
    writer.add_page(pageObj)

    # Persist it for the PDF-to-image conversion step
    with open('cropped_image.pdf', 'wb') as output_handle:
        writer.write(output_handle)

# Create a function to convert the PDF to images
def convert_to_images(input_file):
    """Render a PDF with ``convert_from_path`` and save its first image.

    The first item returned by ``convert_from_path(input_file)`` is written to
    'PDF_image.png' in PNG format; any further items are ignored.
    """
    rendered_pages = convert_from_path(input_file)
    # Only the first rendered page is kept
    rendered_pages[0].save('PDF_image.png', 'PNG')

# Create a function to read text from images
def image_to_text(image_path):
    """Run OCR on an image file and return the recognised text.

    Parameters
    ----------
    image_path : path of the image file to read.

    Returns
    -------
    The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so the underlying file is closed
    # once OCR is done; the same path is overwritten for every figure and
    # deleted at the end of the notebook.
    with Image.open(image_path) as img:
        # Extract the text from the image
        text = pytesseract.image_to_string(img)
    return text

In [5]:
# Path of the PDF to process (a relative path, resolved against the current working directory)
# Alternative input kept for reference:
# pdf_path = 'Example PDF.pdf'

pdf_path = 'hban322.pdf'

# Open the PDF in binary mode; the handle stays open for the reader below
# and is closed explicitly in a later cell
pdfFileObj = open(pdf_path, 'rb')
# Create the PyPDF2 reader whose pages are passed to crop_image() during extraction
pdfReaded = PyPDF2.PdfReader(pdfFileObj)

In [6]:
# Create the dictionary that collects the extracted content of each page.
# Key:   'Page_<pagenum>'
# Value: [page_text, line_format, text_from_images, text_from_tables, page_content]
text_per_page = {}

# Open the PDF with pdfplumber once for the whole run. The context manager
# closes the document when the loop finishes (or raises) instead of leaving
# one unclosed document behind for every page.
with pdfplumber.open(pdf_path) as pdf:
    # We extract the pages from the PDF
    for pagenum, page in enumerate(extract_pages(pdf_path)):

        print(f"pagenum={pagenum}")

        # Initialize the variables needed for the text extraction from the page
        pageObj = pdfReaded.pages[pagenum]
        print(f"pageObj={pageObj}")
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Initialize the number of the examined tables
        table_num = 0
        first_element = True
        # Stays False while the table handling below is disabled, so every
        # text element (including text inside tables) is extracted as text
        table_extraction_flag = False
        # Find the examined page
        page_tables = pdf.pages[pagenum]
        print(f"page_table={page_tables}")
        # Find the tables in the page
        tables = page_tables.find_tables()

        print(f"tables={tables}")

        # Find all the elements; iterating the page yields its layout objects
        page_elements = [(element.y1, element) for element in page]
        # Sort all the elements as they appear in the page (largest y1 first)
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Find the elements that compose a page
        for i, component in enumerate(page_elements):
            # Extract the element of the page layout
            element = component[1]

            # Check if the element is text element
            if isinstance(element, LTTextContainer):
                # Check if the text appeared in a table
                if table_extraction_flag == False:
                    # Use the function to extract the text and format for each text element
                    (line_text, format_per_line) = text_extraction(element)
                    # Append the text of each line to the page text
                    page_text.append(line_text)
                    # Append the format for each line containing text
                    line_format.append(format_per_line)
                    page_content.append(line_text)
                else:
                    # Omit the text that appeared in a table
                    pass

            # Check the elements for images
            if isinstance(element, LTFigure):
                # Crop the image from PDF (modifies pageObj.mediabox in place)
                crop_image(element, pageObj)
                # Convert the cropped pdf to image
                convert_to_images('cropped_image.pdf')
                # Extract the text from image
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Add a placeholder in the text and format lists
                page_text.append('image')
                line_format.append('image')

            # Table handling is currently disabled, so text_from_tables stays empty.
            # Check the elements for tables
            # if isinstance(element, LTRect):
            #     # If first rectangular element
            #     if first_element == True and (table_num+1) <= len(tables):
            #         # Find the bounding box of the table
            #         lower_side = page.bbox[3] - tables[table_num].bbox[3]
            #         upper_side = element.y1 
            #         # Extract the information of the table
            #         table = extract_table(pdf_path, pagenum, table_num)
            #         # Convert the table information in structured string format
            #         table_string = table_converter(table)
            #         # Append the table string into a list
            #         text_from_tables.append(table_string)
            #         page_content.append(table_string)
            #         # Set the flag as True to avoid the content again
            #         table_extraction_flag = True
            #         # Make it other element
            #         first_element = False
            #         # Add a placeholder in the text and format lists
            #         page_text.append('table')
            #         line_format.append('table')

            #     # Check if we already extracted the tables from the page
            #     if element.y0 >= lower_side and element.y1 <= upper_side:
            #         pass
            #     elif not isinstance(page_elements[i+1][1], LTRect):
            #         table_extraction_flag = False
            #         first_element = True
            #         table_num+=1


        # Create the key of the dictionary
        dctkey = 'Page_'+str(pagenum)
        # Add the list of lists as value of the page key
        text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

pagenum=0
pageObj={'/Contents': [IndirectObject(488, 0, 139918487630800), IndirectObject(489, 0, 139918487630800), IndirectObject(490, 0, 139918487630800), IndirectObject(492, 0, 139918487630800), IndirectObject(493, 0, 139918487630800), IndirectObject(494, 0, 139918487630800), IndirectObject(496, 0, 139918487630800), IndirectObject(497, 0, 139918487630800)], '/CropBox': [0, 0, 612, 792], '/MediaBox': [0, 0, 612, 792], '/Parent': IndirectObject(472, 0, 139918487630800), '/Resources': IndirectObject(516, 0, 139918487630800), '/Rotate': 0, '/Type': '/Page'}
page_table=<Page:1>
tables=[]
pagenum=1
pageObj={'/Contents': IndirectObject(2, 0, 139918487630800), '/CropBox': [0, 0, 612, 792], '/MediaBox': [0, 0, 612, 792], '/Parent': IndirectObject(472, 0, 139918487630800), '/Resources': IndirectObject(152, 0, 139918487630800), '/Rotate': 0, '/Type': '/Page'}
page_table=<Page:2>
tables=[<pdfplumber.table.Table object at 0x7f4121203f10>]
pagenum=2
pageObj={'/Contents': IndirectObject(5, 0, 13991

In [10]:
# Release the binary file handle that was opened for PyPDF2.PdfReader
pdfFileObj.close()

In [8]:
# Delete the additional files created
# Both helper files are only written when a figure element was processed, so
# guard each removal: an unconditional os.remove() raises FileNotFoundError
# when no figure was found or when this cell is run a second time.
for temp_file in ('cropped_image.pdf', 'PDF_image.png'):
    if os.path.exists(temp_file):
        os.remove(temp_file)

In [7]:
# Display the content of the first page ('Page_0')
# Index 4 of a page entry is `page_content`: the extracted text lines and
# OCR text in the order the elements were processed
result = ''.join(text_per_page['Page_0'][4])
print(result)

Exhibit	99.1
(4) Huntington
October	21,	2022
Analysts:	Tim	Sedabres	(timothy.sedabres@huntington.com),	952.745.2766
Media:	Seth	Seymour	(corpmedia@huntington.com),	614.480.3538
HUNTINGTON	BANCSHARES	INCORPORATED	REPORTS	2022	THIRD-QUARTER	EARNINGS
Second	Consecutive	Quarter	of	Record	Net	Income	with	Total	Revenue	Up	9%	Sequentially
2022	Third-Quarter	Highlights:
•
Earnings	per	common	share	(EPS)	for	the	quarter	were	$0.39,	an	increase	of	$0.04	from	the	prior	quarter.
• Net	interest	income	increased	$143	million,	or	11%,	from	the	prior	quarter,	reflecting	net	interest	margin	
expansion	of	27	basis	points	to	3.42%	and	higher	average	total	loans	and	leases.
• Noninterest	income	increased	$13	million,	or	3%,	from	the	prior	quarter,	reflecting	strength	in	capital	
markets	supported	by	full	quarter	impact	of	Capstone.
•
Pre-Provision	Net	Revenue	(PPNR),	excluding	Notable	Items,	increased	$109	million,	or	14%,	from	the	prior	
quarter	to	$867	million.
• Average	total	loans	and	leases	increase

In [8]:
# Display the content of the page stored under 'Page_1'
# Index 4 of a page entry is `page_content`: the extracted text lines and
# OCR text in the order the elements were processed
result = ''.join(text_per_page['Page_1'][4])
print(result)

COLUMBUS,	Ohio	–	Huntington	Bancshares	Incorporated	(Nasdaq:	HBAN)	reported	net	income	for	the	2022	
third	quarter	of	$594	million,	or	$0.39	per	common	share,	an	increase	of	$217	million,	or	$0.17	per	common	
share	from	the	year-ago	quarter.
Return	on	average	assets	was	1.31%,	return	on	average	common	equity	was	13.9%,	return	on	average	
tangible	common	equity	(ROTCE)	was	21.9%,	and	adjusted	ROTCE	was	22.2%.	
CEO	Commentary:
"We	are	very	pleased	with	our	performance	this	quarter,	which	included	record	net	earnings	for	the	
second	consecutive	quarter,"	said	Steve	Steinour,	chairman,	president	and	CEO.	"We	successfully	executed	on	
our	business	strategies,	delivering	robust	loan	growth,	higher	deposit	balances,	and	expanded	fee	income.	These	
results,	and	the	benefit	from	the	higher	interest	rate	environment,	combined	to	produce	sequential	pre-
provision	net	revenue	growth	of	14%	for	the	quarter	when	excluding	Notable	Items.
"We	continue	to	make	strategic	investments	to	drive	sustainable

In [9]:
# Display the content of the page stored under 'Page_2'
# Index 4 of a page entry is `page_content`: the extracted text lines and
# OCR text in the order the elements were processed
result = ''.join(text_per_page['Page_2'][4])
print(result)

Table	1	–	Earnings	Performance	Summary
2022
2021
Third
Second
First
Fourth
Third
(in	millions,	except	per	share	data)
Quarter
Quarter
Quarter
Quarter
Quarter
Net	income	attributable	to	Huntington	Bancshares	Inc
$	
$	
594	
$	
539	
$	
460	
$	
401	
377	
Diluted	earnings	per	common	share
0.39	
0.35	
0.29	
0.26	
0.22	
	
	
	
	
	
Return	on	average	assets
	1.31	%
	1.22	%
	1.05	%
	0.92	%
	0.86	%
Return	on	average	common	equity
	13.9	
	12.8	
	10.4	
	8.7	
	7.6	
Return	on	average	tangible	common	equity
	21.9	
	19.9	
	15.8	
	13.2	
	11.5	
Net	interest	margin
	3.42	
	3.15	
	2.88	
	2.85	
	2.91	
Efficiency	ratio
	54.4	
	57.3	
	62.9	
	73.0	
	74.9	
Tangible	book	value	per	common	share
$	
6.40	
$	
6.96	
$	
7.47	
$	
8.06	
$	
8.09	
Cash	dividends	declared	per	common	share
0.155	
0.155	
0.155	
0.155	
0.15	
	
	
	
	
	
Average	earning	assets
$	 164,024	
$	 161,225	
$	 162,414	
$	 158,692	
$	 159,148	
Average	loans	and	leases
116,964	
113,949	
111,142	
109,488	
109,668	
	
	
	
	
	
Average	core	deposits
141,691	
1