# Layout Parser Tutorial

In this notebook, we will be implementing a Python package called LayoutParser. We will be taking an image-recognition approach to see if perhaps we can do document segmentation that way. 

In [1]:
# Need this for Tesseract to function properly
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# Delete unused packages once idea is fully realized
import numpy as np
import pdfkit
#import fpdf
import pandas as pd
import os
import pickle
import time
import json
import requests
import re
import urllib
from bs4 import BeautifulSoup

from pdf2image import convert_from_path, convert_from_bytes
from pytesseract import pytesseract
import layoutparser as lp
import matplotlib.pyplot as plt
import cv2 

from datetime import datetime

#import pytesseract
from PIL import ImageEnhance, ImageFilter, Image

import lxml.html


First, let's see if we can convert our documents to a picture right away. This may make the process significantly easier.

In [3]:
with open('../Step1-Data/2-link_dict.pickle', 'rb') as f:
    link_dict = pickle.load(f)

In [4]:
sample = link_dict['0000006201'].copy()
sample_link = sample['Link'].values[0]

In [5]:
pdfkit.from_url(sample_link, 'out.pdf')

Loading pages (1/6)
Printing pages (6/6)


True

In [6]:
from tika import parser

rawText = parser.from_file('Out.pdf')

rawList = rawText['content'].splitlines()

In [7]:
rawList[200:220]

['General',
 'This\treport\tis\tfiled\tby\tAmerican\tAirlines\tGroup\tInc.\t(AAG)\tand\tits\twholly-owned\tsubsidiary\tAmerican\tAirlines,\tInc.\t(American).\tReferences\tin\tthis\tAnnual',
 '',
 'Report\ton\tForm\t10-K\tto\t“we,”\t“us,”\t“our,”\tthe\t“Company”\tand\tsimilar\tterms\trefer\tto\tAAG\tand\tits\tconsolidated\tsubsidiaries.\t“AMR”\tor\t“AMR\tCorporation”',
 'refers\tto\tthe\tCompany\tduring\tthe\tperiod\tof\ttime\tprior\tto\tits\temergence\tfrom\tChapter\t11\tand\tits\tacquisition\tof\tUS\tAirways\tGroup,\tInc.\t(US\tAirways\tGroup)',
 'on\tDecember\t9,\t2013\t(the\tMerger).\tReferences\tto\tUS\tAirways\tGroup\tand\tUS\tAirways,\tInc.,\ta\tsubsidiary\tof\tUS\tAirways\tGroup\t(US\tAirways),\trepresent',
 'the\tentities\tduring\tthe\tperiod\tof\ttime\tprior\tto\tthe\tdissolution\tof\tthose\tentities\tin\tconnection\twith\tAAG’s\tinternal\tcorporate\trestructuring\ton\tDecember\t30,',
 '2015.\tReferences\tin\tthis\treport\tto\t“mainline”\trefer\tto\tthe\toperations\tof\tAmeric

In [8]:
result = [x for x in rawList if x[:4] == 'ITEM']

In [9]:
result

['ITEM\t1.\tBUSINESS',
 'ITEM\t1A.\tRISK\tFACTORS',
 'ITEM\t1B.\tUNRESOLVED\tSTAFF\tCOMMENTS',
 'ITEM\t2.\tPROPERTIES',
 'ITEM\t3.\t\tLEGAL\tPROCEEDINGS',
 'ITEM\t4.\t\tMINE\tSAFETY\tDISCLOSURES',
 'ITEM\t5.\t\t\t\tMARKET\tFOR\tAMERICAN\tAIRLINES\tGROUP’S\tCOMMON\tSTOCK,\tRELATED\tSTOCKHOLDER\tMATTERS\tAND\tISSUER\tPURCHASES',
 'ITEM\t6.\t\tSELECTED\tCONSOLIDATED\tFINANCIAL\tDATA',
 'ITEM\t7.\t\tMANAGEMENT’S\tDISCUSSION\tAND\tANALYSIS\tOF\tFINANCIAL\tCONDITION\tAND\tRESULTS\tOF\tOPERATIONS',
 'ITEM\t7A.\tQUANTITATIVE\tAND\tQUALITATIVE\tDISCLOSURES\tABOUT\tMARKET\tRISK',
 'ITEM\t8A.\tCONSOLIDATED\tFINANCIAL\tSTATEMENTS\tAND\tSUPPLEMENTARY\tDATA\tOF\tAMERICAN\tAIRLINES\tGROUP\tINC.',
 'ITEM\t8B.\tCONSOLIDATED\tFINANCIAL\tSTATEMENTS\tAND\tSUPPLEMENTARY\tDATA\tOF\tAMERICAN\tAIRLINES,\tINC.',
 'ITEM\t9.\t\tCHANGES\tIN\tAND\tDISAGREEMENTS\tWITH\tACCOUNTANTS\tON\tACCOUNTING\tAND\tFINANCIAL\tDISCLOSURE',
 'ITEM\t9A.\t\tCONTROLS\tAND\tPROCEDURES',
 'ITEM\t9B.\t\tOTHER\tINFORMATION',
 'ITEM\t10.

I wonder if this strategy will work for all forms? This would be very handy... Let's see if we can extract the necessary information:

In [10]:
matches = re.compile(r'^ITEM.*1[^A][^A].*BUSINESS')
item7 = [matches.search(x) for x in result]

#rawList.index([-1])
[x for x in rawList if matches.search(x) != None]

['ITEM\t1.\tBUSINESS']

In [11]:
idx7 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

In [23]:
matches = re.compile(r'^ITEM.*8[^B][^B]')
item7 = [matches.search(x) for x in result]
[x for x in rawList if matches.search(x) != None]

['ITEM\t8A.\tCONSOLIDATED\tFINANCIAL\tSTATEMENTS\tAND\tSUPPLEMENTARY\tDATA\tOF\tAMERICAN\tAIRLINES\tGROUP\tINC.']

In [24]:
idx8 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

In [25]:
item7str = ' '.join(rawList[idx7:idx8]).replace('\t', ' ')
#item7str

# Expand to All Filings For One Company

In [33]:
options = { 'quiet': '' }
start = time.time()
item7_company = []
for link in link_dict['0000006201']['Link'].values:
    pdfkit.from_url(link, 'out.pdf', options=options)
    rawText = parser.from_file('out.pdf')
    rawList = rawText['content'].splitlines()
    result = [x for x in rawList if x[:4] == 'ITEM']
    
    try:
        ##### ITEM 7
        matches = re.compile(r'^ITEM.*7[^A][^A]')
        item7 = [matches.search(x) for x in result]
        idx7 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        ##### ITEM 8
        matches = re.compile(r'^ITEM.*8[^B][^B]')
        item7 = [matches.search(x) for x in result]
        idx8 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        item7_company.append(' '.join(rawList[idx7:idx8]).replace('\t', ' '))
    except:
        item7_company.append('')
    
end = time.time()

In [34]:
print(end-start)

70.30988907814026


In [35]:
link_dict['0000006201']['Item7'] = item7_company

In [14]:
link_dict['0000006201']

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Link,Loc7,Loc8
49,0000006201-21-000014,2021-02-17,2020-12-31,2021-02-17T17:17:57.000Z,34,10-K,001-08400,21646186,,43925703,1,1,aal-20201231.htm,10-K 2020 02.17.21,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(158292, 158310), Item 7. Management], [(167...","[[(502641, 502662), ITEM 8A. CONSOLIDATED]]"
150,0000006201-20-000023,2020-02-19,2019-12-31,2020-02-19T07:31:30.000Z,34,10-K,001-08400,20627428,,30851334,1,1,a10k123119.htm,10-K 2019 02.19.20,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(153128, 153146), Item 7. Management], [(156...","[[(414897, 414918), ITEM 8A. CONSOLIDATED]]"
225,0000006201-19-000009,2019-02-25,2018-12-31,2019-02-25T07:31:34.000Z,34,10-K,001-08400,19628071,,30572408,1,0,a10k123118.htm,10-K 2018 02.25.19,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9505, 9523), Item 7. Management], [(12796, ...","[[(300867, 300888), ITEM 8A. CONSOLIDATED]]"
315,0000006201-18-000009,2018-02-21,2017-12-31,2018-02-21T08:02:40.000Z,34,10-K,001-08400,18627088,,27914491,1,0,a10k123117.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9554, 9572), Item 7. Management], [(13606, ...","[[(293380, 293401), ITEM 8A. CONSOLIDATED]]"
412,0001193125-17-051216,2017-02-22,2016-12-31,2017-02-22T08:01:43.000Z,34,10-K,001-08400,17627073,,24888480,1,0,d286458d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9935, 9953), Item 7. Management], [(14047, ...","[[(297249, 297270), ITEM 8A. CONSOLIDATED]]"
538,0001193125-16-474605,2016-02-24,2015-12-31,2016-02-24T08:04:10.000Z,34,10-K,001-08400,161450518,,26170400,1,0,d78287d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(17027, 17045), Item 7. Management], [(21453...","[[(398001, 398022), ITEM 8A. CONSOLIDATED]]"
651,0001193125-15-061145,2015-02-25,2014-12-31,2015-02-25T08:02:34.000Z,34,10-K,001-08400,15645918,,39524925,1,0,d829913d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(16174, 16192), Item 7. Management], [(23008...","[[(452689, 452710), ITEM 8A. CONSOLIDATED]]"
750,0000006201-14-000004,2014-02-28,2013-12-31,2014-02-28T07:52:16.000Z,34,10-K,001-08400,14651496,,47888955,1,0,aagaa10k-20131231.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(15590, 15608), Item 7. Management], [(23363...",[]


In [4]:
with open('../Step1-Data/8-link_dict.pickle', 'rb') as f:
    link_dict = pickle.load(f)

In [14]:
len(link_dict.keys())

985

# Extend to All Companies

In [12]:
options = { 'quiet': '' }

counter = 0
for company in list(link_dict.keys()):
    
    now = datetime.now()
    print(counter, ' - ', now.strftime("%H:%M:%S"))
    item1_company = []
    item1a_company = []
    item7_company = []
    for link in link_dict[company]['Link'].values:
        try:
            pdfkit.from_url(link, 'out.pdf', options=options)
            
        except:
            item1_company.append('')
            item1a_company.append('')
            item7_company.append('')
            continue
            
        rawText = parser.from_file('out.pdf')
        rawList = rawText['content'].splitlines()
        result = [x for x in rawList if re.match('ITEM', x[:4], re.IGNORECASE)]
        
        try:
            ##### ITEM 1
            matches = re.compile(r'^ITEM.*1[^A].*BUSINESS', re.IGNORECASE)
            #item7 = [matches.search(x) for x in result]
            idx1 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

            ##### ITEM 1A
            matches = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
            #item7 = [matches.search(x) for x in result]
            idx1a = rawList.index([x for x in rawList if matches.search(x) != None][-1])
            
            item1_company.append(' '.join(rawList[idx1:idx1a]).replace('\t', ' '))
        
        except:
            item1_company.append('')
        
        try:
            ##### ITEM 1A
            matches = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
            #item7 = [matches.search(x) for x in result]
            idx1a = rawList.index([x for x in rawList if matches.search(x) != None][-1])

            ##### ITEM 1B
            matches = re.compile(r'^ITEM.*1(B| B)', re.IGNORECASE)
            #item7 = [matches.search(x) for x in result]
            idx1b = rawList.index([x for x in rawList if matches.search(x) != None][-1])
            
            item1a_company.append(' '.join(rawList[idx1a:idx1b]).replace('\t', ' '))
        
        except:
            item1a_company.append('')

        try:
            
            ##### ITEM 7
            matches = re.compile(r'^ITEM.*7[^A][^A]', re.IGNORECASE)
            #item7 = [matches.search(x) for x in result]
            idx7 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

            ##### ITEM 8
            matches = re.compile(r'^ITEM.*8[^B][^B]', re.IGNORECASE)
            #item7 = [matches.search(x) for x in result]
            idx8 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

            item7_company.append(' '.join(rawList[idx7:idx8]).replace('\t', ' '))
        except:
            item7_company.append('')
    
    link_dict[company]['Item1']  = item1_company
    link_dict[company]['Item1a'] = item1a_company
    link_dict[company]['Item7']  = item7_company
    
    if counter % 50 == 0:
        with open('8-link_dict.pickle', 'wb') as handle:
            pickle.dump(link_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    counter += 1
    
with open('8-link_dict.pickle', 'wb') as handle:
            pickle.dump(link_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

0  -  23:02:01
1  -  23:03:13
2  -  23:03:40
3  -  23:04:55
4  -  23:05:34
5  -  23:06:12
6  -  23:06:53
7  -  23:07:33
8  -  23:08:00
9  -  23:10:07
10  -  23:10:55
11  -  23:11:31
12  -  23:12:01
13  -  23:13:10
14  -  23:13:44
15  -  23:13:59
16  -  23:14:34
17  -  23:14:34
18  -  23:15:28
19  -  23:15:28
20  -  23:16:01
21  -  23:16:43
22  -  23:17:27
23  -  23:18:21
24  -  23:19:11
25  -  23:19:33
26  -  23:21:31
27  -  23:22:22
28  -  23:24:33
29  -  23:24:58
30  -  23:27:07
31  -  23:27:11
32  -  23:28:13
33  -  23:28:29
34  -  23:29:51
35  -  23:30:27
36  -  23:31:35
37  -  23:32:46
38  -  23:33:34
39  -  23:34:09
40  -  23:34:52
41  -  23:35:37
42  -  23:36:38
43  -  23:37:25
44  -  23:38:04
45  -  23:39:55
46  -  23:40:44
47  -  23:41:03
48  -  23:41:55
49  -  23:42:12
50  -  23:43:03
51  -  23:43:55
52  -  23:44:08
53  -  23:44:47
54  -  23:45:20
55  -  23:46:13
56  -  23:46:56
57  -  23:47:40
58  -  23:48:43
59  -  23:49:36
60  -  23:50:06
61  -  23:50:55
62  -  23:51:37
63

489  -  05:29:14
490  -  05:29:50
491  -  05:30:12
492  -  05:31:25
493  -  05:32:07
494  -  05:33:07
495  -  05:33:48
496  -  05:34:49
497  -  05:35:31
498  -  05:35:31
499  -  05:35:44
500  -  05:36:09
501  -  05:36:54
502  -  05:37:48
503  -  05:39:18
504  -  05:40:19
505  -  05:41:43
506  -  05:42:20
507  -  05:43:15
508  -  05:44:04
509  -  05:45:17
510  -  05:46:02
511  -  05:46:35
512  -  05:47:05
513  -  05:48:28
514  -  05:49:21
515  -  05:50:08
516  -  05:50:36
517  -  05:50:46
518  -  05:50:59
519  -  05:51:39
520  -  05:52:23
521  -  05:52:48
522  -  05:53:36
523  -  05:54:30
524  -  05:55:27
525  -  05:55:55
526  -  05:56:32
527  -  05:56:51
528  -  05:56:51
529  -  05:58:08
530  -  05:58:49
531  -  05:59:29
532  -  05:59:55
533  -  06:00:18
534  -  06:01:15
535  -  06:03:04
536  -  06:03:32
537  -  06:04:05
538  -  06:05:17
539  -  06:06:07
540  -  06:07:54
541  -  06:09:45
542  -  06:10:16
543  -  06:10:16
544  -  06:10:59
545  -  06:12:13
546  -  06:13:09
547  -  06:14:

971  -  13:40:45
972  -  13:41:45
973  -  13:41:55
974  -  13:43:01
975  -  13:43:48
976  -  13:44:26
977  -  13:44:48
978  -  13:45:49
979  -  13:47:00
980  -  13:48:17
981  -  13:49:45
982  -  13:50:48
983  -  13:52:19
984  -  13:53:23


In [9]:
with open('6-link_dict.pickle', 'wb') as handle:
    pickle.dump(link_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [467]:
list(link_dict.keys()).index(company)

765

In [None]:
https://www.sec.gov/Archives/edgar/data/835910/000083591001000012/0000835910-01-000012.txt

In [384]:
for link in link_dict['0000006201']['Link'].values:
    pdfkit.from_url(link, 'out.pdf')
    rawText = parser.from_file('out.pdf')
    rawList = rawText['content'].splitlines()
    result = [x for x in rawList if x[:4] == 'ITEM']
    print(len(result))

Loading pages (1/6)
Printing pages (6/6)


22
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                          
22
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                          
21
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                        

In [None]:
response = json.loads(requests.get("https://data.sec.gov/submissions/CIK"+cik+".json", headers=heads).text)

In [1]:
badslist = []
for company in companies:
    bads = [len(x) for x in link_dict[company]['Item7'].values]
    if bads.count(0) > 0:
        badslist.append(company)

NameError: name 'companies' is not defined

In [8]:
goodslist = [x for x in link_dict.keys() if x not in badslist]

NameError: name 'badslist' is not defined

In [27]:
company = '0001158449'

In [28]:
now = datetime.now()
print(counter, ' - ', now.strftime("%H:%M:%S"))
item1_company = []
item1a_company = []
item7_company = []
for link in link_dict[company]['Link'].values:
    try:
        pdfkit.from_url(link, 'out.pdf', options=options)

    except:
        item1_company.append('')
        item1a_company.append('')
        item7_company.append('')
        continue

    rawText = parser.from_file('out.pdf')
    rawList = rawText['content'].splitlines()
    result = [x for x in rawList if re.match('ITEM', x[:4], re.IGNORECASE)]

    try:
        ##### ITEM 1
        matches = re.compile(r'^ITEM.*1[^A][^A].*BUSINESS', re.IGNORECASE)
        #item7 = [matches.search(x) for x in result]
        idx1 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        ##### ITEM 1A
        matches = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
        #item7 = [matches.search(x) for x in result]
        idx1a = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        item1_company.append(' '.join(rawList[idx1:idx1a]).replace('\t', ' '))

    except:
        item1_company.append('')

    try:
        ##### ITEM 1A
        matches = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
        #item7 = [matches.search(x) for x in result]
        idx1a = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        ##### ITEM 1B
        matches = re.compile(r'^ITEM.*1(B| B)', re.IGNORECASE)
        #item7 = [matches.search(x) for x in result]
        idx1b = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        item1a_company.append(' '.join(rawList[idx1a:idx1b]).replace('\t', ' '))

    except:
        item1a_company.append('')

    try:

        ##### ITEM 7
        matches = re.compile(r'^ITEM.*7[^A][^A]', re.IGNORECASE)
        #item7 = [matches.search(x) for x in result]
        idx7 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        ##### ITEM 8
        matches = re.compile(r'^ITEM.*8[^B][^B]', re.IGNORECASE)
        #item7 = [matches.search(x) for x in result]
        idx8 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

        item7_company.append(' '.join(rawList[idx7:idx8]).replace('\t', ' '))
    except:
        item7_company.append('')

link_dict[company]['Item1']  = item1_company
link_dict[company]['Item1a'] = item1a_company
link_dict[company]['Item7']  = item7_company

23  -  20:51:26


ValueError: Length of values (1) does not match length of index (8)

result

In [30]:
result

['Item\t1.\t\t\t\tBusiness.',
 'Item\t1A.\tRisk\tFactors.',
 'Item\t1B.\tUnresolved\tStaff\tComments.',
 'Item\t2.\tProperties.',
 'Item\t3.\tLegal\tProceedings.',
 'Item\t4.\tMine\tSafety\tDisclosures.',
 'Item\t5. Market\t for\t Registrant’s\t Common\t Equity,\t Related\t Stockholder\t Matters\t and\t Issuer\t Purchases\t of\t Equity',
 'Item\t6.\t\t\t\tSelected\tConsolidated\tFinancial\tData.',
 'Item\t7.\tManagement’s\tDiscussion\tand\tAnalysis\tof\tFinancial\tCondition\tand\tResults\tof\tOperations.',
 'Item\t7A.\tQuantitative\tand\tQualitative\tDisclosures\tabout\tMarket\tRisks.',
 'Item\t8.\tFinancial\tStatements\tand\tSupplementary\tData.',
 'Item\t9.\tChanges\tin\tand\tDisagreements\twith\tAccountants\ton\tAccounting\tand\tFinancial\tDisclosure.',
 'Item\t9A.\tControls\tand\tProcedures.',
 'Item\t9B.\tOther\tInformation.',
 'Item\t10.\tDirectors,\tExecutive\tOfficers\tand\tCorporate\tGovernance.',
 'Item\t11.\tExecutive\tCompensation.',
 'Item\t12.\tSecurity\tOwnership\tof\tCe

In [38]:
##### ITEM 1
matches = re.compile(r'^ITEM.*1.*BUSINESS', re.IGNORECASE)
#item7 = [matches.search(x) for x in result]
idx1 = rawList.index([x for x in rawList if matches.search(x) != None][-1])

##### ITEM 1A
matches = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
#item7 = [matches.search(x) for x in result]
idx1a = rawList.index([x for x in rawList if matches.search(x) != None][-1])

item1_company.append(' '.join(rawList[idx1:idx1a]).replace('\t', ' '))

In [40]:
idx1

203