# Layout Parser Tutorial

In this notebook, we will be using a Python package called LayoutParser. We will take an image-recognition approach to see whether we can perform document segmentation that way.

In [1]:
# Need this for Tesseract to function properly.
# NOTE(review): KMP_DUPLICATE_LIB_OK looks like an OpenMP-runtime switch rather
# than a Tesseract setting — confirm it is still required.
import os
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [2]:
# Delete unused packages once idea is fully realized
import numpy as np
import pdfkit
#import fpdf
import pandas as pd
import os
import pickle
import time
import json
import requests
import re
import urllib
from bs4 import BeautifulSoup

from pdf2image import convert_from_path, convert_from_bytes
from pytesseract import pytesseract
import layoutparser as lp
import matplotlib.pyplot as plt
import cv2 

from datetime import datetime

#import pytesseract
from PIL import ImageEnhance, ImageFilter, Image

import lxml.html


First, let's see if we can convert our documents to a picture right away. This may make the process significantly easier.

In [3]:
# Read the pickled `link_dict` object from the Step1 data folder.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('../Step1-Data/2-link_dict.pickle', 'rb') as pickle_file:
    link_dict = pickle.load(pickle_file)

In [4]:
# Work on a copy of the entry stored under key '0000006201' so the object held
# in link_dict is left untouched (presumably a pandas DataFrame of filings — confirm).
sample = link_dict['0000006201'].copy()
# First value of that entry's 'Link' column; used below as the URL to render.
sample_link = sample['Link'].values[0]

In [10]:
# Render the page at sample_link to a local file named 'out.pdf'.
pdfkit.from_url(sample_link, 'out.pdf')

Loading pages (1/6)
Printing pages (6/6)


True

In [13]:
from tika import parser

# Parse the PDF written by the pdfkit cell above. The name has to match
# 'out.pdf' exactly: the capitalised 'Out.pdf' only resolves on
# case-insensitive filesystems and fails elsewhere.
rawText = parser.from_file('out.pdf')

# One list entry per line of the extracted text.
rawList = rawText['content'].splitlines()

In [17]:
# Peek at a 20-line window (indices 200-219) of the extracted text lines.
rawList[200:220]

['General',
 'This\treport\tis\tfiled\tby\tAmerican\tAirlines\tGroup\tInc.\t(AAG)\tand\tits\twholly-owned\tsubsidiary\tAmerican\tAirlines,\tInc.\t(American).\tReferences\tin\tthis\tAnnual',
 '',
 'Report\ton\tForm\t10-K\tto\t“we,”\t“us,”\t“our,”\tthe\t“Company”\tand\tsimilar\tterms\trefer\tto\tAAG\tand\tits\tconsolidated\tsubsidiaries.\t“AMR”\tor\t“AMR\tCorporation”',
 'refers\tto\tthe\tCompany\tduring\tthe\tperiod\tof\ttime\tprior\tto\tits\temergence\tfrom\tChapter\t11\tand\tits\tacquisition\tof\tUS\tAirways\tGroup,\tInc.\t(US\tAirways\tGroup)',
 'on\tDecember\t9,\t2013\t(the\tMerger).\tReferences\tto\tUS\tAirways\tGroup\tand\tUS\tAirways,\tInc.,\ta\tsubsidiary\tof\tUS\tAirways\tGroup\t(US\tAirways),\trepresent',
 'the\tentities\tduring\tthe\tperiod\tof\ttime\tprior\tto\tthe\tdissolution\tof\tthose\tentities\tin\tconnection\twith\tAAG’s\tinternal\tcorporate\trestructuring\ton\tDecember\t30,',
 '2015.\tReferences\tin\tthis\treport\tto\t“mainline”\trefer\tto\tthe\toperations\tof\tAmeric

In [19]:
# Keep only the lines whose first four characters are exactly 'ITEM' (case-sensitive).
result = [line for line in rawList if line.startswith('ITEM')]

In [20]:
# Display the lines that start with 'ITEM', as collected in the previous cell.
result

['ITEM\t1.\tBUSINESS',
 'ITEM\t1A.\tRISK\tFACTORS',
 'ITEM\t1B.\tUNRESOLVED\tSTAFF\tCOMMENTS',
 'ITEM\t2.\tPROPERTIES',
 'ITEM\t3.\t\tLEGAL\tPROCEEDINGS',
 'ITEM\t4.\t\tMINE\tSAFETY\tDISCLOSURES',
 'ITEM\t5.\t\t\t\tMARKET\tFOR\tAMERICAN\tAIRLINES\tGROUP’S\tCOMMON\tSTOCK,\tRELATED\tSTOCKHOLDER\tMATTERS\tAND\tISSUER\tPURCHASES',
 'ITEM\t6.\t\tSELECTED\tCONSOLIDATED\tFINANCIAL\tDATA',
 'ITEM\t7.\t\tMANAGEMENT’S\tDISCUSSION\tAND\tANALYSIS\tOF\tFINANCIAL\tCONDITION\tAND\tRESULTS\tOF\tOPERATIONS',
 'ITEM\t7A.\tQUANTITATIVE\tAND\tQUALITATIVE\tDISCLOSURES\tABOUT\tMARKET\tRISK',
 'ITEM\t8A.\tCONSOLIDATED\tFINANCIAL\tSTATEMENTS\tAND\tSUPPLEMENTARY\tDATA\tOF\tAMERICAN\tAIRLINES\tGROUP\tINC.',
 'ITEM\t8B.\tCONSOLIDATED\tFINANCIAL\tSTATEMENTS\tAND\tSUPPLEMENTARY\tDATA\tOF\tAMERICAN\tAIRLINES,\tINC.',
 'ITEM\t9.\t\tCHANGES\tIN\tAND\tDISAGREEMENTS\tWITH\tACCOUNTANTS\tON\tACCOUNTING\tAND\tFINANCIAL\tDISCLOSURE',
 'ITEM\t9A.\t\tCONTROLS\tAND\tPROCEDURES',
 'ITEM\t9B.\t\tOTHER\tINFORMATION',
 'ITEM\t10.

I wonder if this strategy will work for all forms? This would be very handy... Let's see if we can extract the necessary information:

In [21]:
# Heading pattern: the line starts with 'ITEM', later contains a '1' followed by
# two characters that are not 'A', and then the word 'BUSINESS'.
# NOTE(review): the names `item7` here and `idx7` in the next cell suggest Item 7,
# but this pattern targets the Item 1 "BUSINESS" heading — confirm which is intended.
matches = re.compile(r'^ITEM.*1[^A][^A].*BUSINESS')
item7 = [matches.search(heading) for heading in result]

# Show every extracted line that matches the pattern.
[line for line in rawList if matches.search(line) is not None]

['ITEM\t1.\tBUSINESS']

In [22]:
# Position of the LAST line matching the current pattern. Looking the matched
# text up with list.index() returns the first line carrying that text, which is
# the wrong position whenever the same heading text occurs more than once in
# the document. Raises IndexError when nothing matches, as before.
idx7 = [pos for pos, line in enumerate(rawList) if matches.search(line)][-1]

In [23]:
# Heading pattern: the line starts with 'ITEM' and later contains an '8'
# followed by two characters that are not 'B'.
matches = re.compile(r'^ITEM.*8[^B][^B]')
item7 = [matches.search(heading) for heading in result]
# Show every extracted line that matches the pattern.
[line for line in rawList if matches.search(line) is not None]

['ITEM\t8A.\tCONSOLIDATED\tFINANCIAL\tSTATEMENTS\tAND\tSUPPLEMENTARY\tDATA\tOF\tAMERICAN\tAIRLINES\tGROUP\tINC.']

In [24]:
# Position of the LAST line matching the current pattern. Looking the matched
# text up with list.index() returns the first line carrying that text, which is
# the wrong position whenever the same heading text occurs more than once in
# the document. Raises IndexError when nothing matches, as before.
idx8 = [pos for pos, line in enumerate(rawList) if matches.search(line)][-1]

In [25]:
# Join the lines from idx7 up to (but not including) idx8 into one string,
# then turn the tab separators left by the PDF text extraction into spaces.
# NOTE(review): idx7 is located with the Item 1 "BUSINESS" pattern in the cells
# above, so despite its name this span appears to start at Item 1 — confirm.
item7str = ' '.join(rawList[idx7:idx8]).replace('\t', ' ')
#item7str

# Expand to All Filings For One Company

In [33]:
# Extract the Item 7 text from every filing of company '0000006201', timing the run.
# 'quiet' is passed to pdfkit as an options dict to silence its progress output.
options = {'quiet': ''}

# Heading patterns do not change between filings, so compile them once.
item7_heading = re.compile(r'^ITEM.*7[^A][^A]')
item8_heading = re.compile(r'^ITEM.*8[^B][^B]')

start = time.time()
item7_company = []
for link in link_dict['0000006201']['Link'].values:
    pdfkit.from_url(link, 'out.pdf', options=options)
    rawText = parser.from_file('out.pdf')
    # Guard against a parse result with no text content (presumably possible
    # for a PDF without extractable text — confirm against the tika docs).
    rawList = (rawText['content'] or '').splitlines()

    # Collect line positions directly: looking the heading text up with
    # list.index() would return its FIRST occurrence, not the last matching line.
    item7_hits = [pos for pos, line in enumerate(rawList) if item7_heading.search(line)]
    item8_hits = [pos for pos, line in enumerate(rawList) if item8_heading.search(line)]

    if item7_hits and item8_hits:
        idx7, idx8 = item7_hits[-1], item8_hits[-1]
        item7_company.append(' '.join(rawList[idx7:idx8]).replace('\t', ' '))
    else:
        # A missing heading is recorded as an empty string so the list stays
        # aligned with the filings (one entry per link).
        item7_company.append('')

end = time.time()

In [34]:
# Elapsed wall-clock seconds between the two time.time() readings taken around the loop above.
print(end-start)

70.30988907814026


In [35]:
# Store the extracted texts (one entry per link processed above) under the 'Item7' key/column.
link_dict['0000006201']['Item7'] = item7_company

In [14]:
# Display the entry stored under key '0000006201'.
link_dict['0000006201']

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Link,Loc7,Loc8
49,0000006201-21-000014,2021-02-17,2020-12-31,2021-02-17T17:17:57.000Z,34,10-K,001-08400,21646186,,43925703,1,1,aal-20201231.htm,10-K 2020 02.17.21,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(158292, 158310), Item 7. Management], [(167...","[[(502641, 502662), ITEM 8A. CONSOLIDATED]]"
150,0000006201-20-000023,2020-02-19,2019-12-31,2020-02-19T07:31:30.000Z,34,10-K,001-08400,20627428,,30851334,1,1,a10k123119.htm,10-K 2019 02.19.20,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(153128, 153146), Item 7. Management], [(156...","[[(414897, 414918), ITEM 8A. CONSOLIDATED]]"
225,0000006201-19-000009,2019-02-25,2018-12-31,2019-02-25T07:31:34.000Z,34,10-K,001-08400,19628071,,30572408,1,0,a10k123118.htm,10-K 2018 02.25.19,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9505, 9523), Item 7. Management], [(12796, ...","[[(300867, 300888), ITEM 8A. CONSOLIDATED]]"
315,0000006201-18-000009,2018-02-21,2017-12-31,2018-02-21T08:02:40.000Z,34,10-K,001-08400,18627088,,27914491,1,0,a10k123117.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9554, 9572), Item 7. Management], [(13606, ...","[[(293380, 293401), ITEM 8A. CONSOLIDATED]]"
412,0001193125-17-051216,2017-02-22,2016-12-31,2017-02-22T08:01:43.000Z,34,10-K,001-08400,17627073,,24888480,1,0,d286458d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9935, 9953), Item 7. Management], [(14047, ...","[[(297249, 297270), ITEM 8A. CONSOLIDATED]]"
538,0001193125-16-474605,2016-02-24,2015-12-31,2016-02-24T08:04:10.000Z,34,10-K,001-08400,161450518,,26170400,1,0,d78287d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(17027, 17045), Item 7. Management], [(21453...","[[(398001, 398022), ITEM 8A. CONSOLIDATED]]"
651,0001193125-15-061145,2015-02-25,2014-12-31,2015-02-25T08:02:34.000Z,34,10-K,001-08400,15645918,,39524925,1,0,d829913d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(16174, 16192), Item 7. Management], [(23008...","[[(452689, 452710), ITEM 8A. CONSOLIDATED]]"
750,0000006201-14-000004,2014-02-28,2013-12-31,2014-02-28T07:52:16.000Z,34,10-K,001-08400,14651496,,47888955,1,0,aagaa10k-20131231.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(15590, 15608), Item 7. Management], [(23363...",[]


In [4]:
# Replace `link_dict` with the object pickled in 8-link_dict.pickle.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('../Step1-Data/8-link_dict.pickle', 'rb') as pickle_file:
    link_dict = pickle.load(pickle_file)

# Extend to All Companies

In [9]:
# For every company in link_dict: render each filing link to PDF, extract its
# text, and store the Item 1, Item 1A and Item 7 sections as new columns.
# Progress is checkpointed to disk every 50 companies and once more at the end.
options = {'quiet': ''}

# Heading patterns do not change between filings, so compile them once.
item1_heading = re.compile(r'^ITEM.*1[^A].*BUSINESS', re.IGNORECASE)
item1a_heading = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
item1b_heading = re.compile(r'^ITEM.*1(B| B)', re.IGNORECASE)
item7_heading = re.compile(r'^ITEM.*7[^A][^A]', re.IGNORECASE)
item8_heading = re.compile(r'^ITEM.*8[^B][^B]', re.IGNORECASE)

# Output column -> (heading that starts the section, heading that ends it).
section_bounds = {
    'Item1': (item1_heading, item1a_heading),
    'Item1a': (item1a_heading, item1b_heading),
    'Item7': (item7_heading, item8_heading),
}

counter = 0
for company in list(link_dict.keys()):

    now = datetime.now()
    print(counter, ' - ', now.strftime("%H:%M:%S"))
    item1_company = []
    item1a_company = []
    item7_company = []
    section_texts = {
        'Item1': item1_company,
        'Item1a': item1a_company,
        'Item7': item7_company,
    }

    for link in link_dict[company]['Link'].values:
        try:
            pdfkit.from_url(link, 'out.pdf', options=options)
        except Exception:
            # Best effort: a filing that cannot be rendered gets empty sections.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during this multi-hour run.
            for texts in section_texts.values():
                texts.append('')
            continue

        rawText = parser.from_file('out.pdf')
        # Guard against a parse result with no text content (presumably possible
        # for a PDF without extractable text — confirm against the tika docs).
        rawList = (rawText['content'] or '').splitlines()

        for column, (start_heading, end_heading) in section_bounds.items():
            # Collect line positions directly: looking the heading text up with
            # list.index() would return its FIRST occurrence, not the last match.
            start_hits = [pos for pos, line in enumerate(rawList) if start_heading.search(line)]
            end_hits = [pos for pos, line in enumerate(rawList) if end_heading.search(line)]

            if start_hits and end_hits:
                section = ' '.join(rawList[start_hits[-1]:end_hits[-1]]).replace('\t', ' ')
            else:
                # Either heading missing: keep the lists aligned with the filings.
                section = ''
            section_texts[column].append(section)

    link_dict[company]['Item1'] = item1_company
    link_dict[company]['Item1a'] = item1a_company
    link_dict[company]['Item7'] = item7_company

    # Periodic checkpoint so a crash does not lose hours of work.
    if counter % 50 == 0:
        with open('8-link_dict.pickle', 'wb') as handle:
            pickle.dump(link_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    counter += 1

# Final save once every company has been processed.
with open('8-link_dict.pickle', 'wb') as handle:
    pickle.dump(link_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

0  -  23:14:08
1  -  23:15:22
2  -  23:15:58
3  -  23:16:36
4  -  23:17:17
5  -  23:17:54
6  -  23:18:19
7  -  23:20:21
8  -  23:21:08
9  -  23:21:38
10  -  23:22:06
11  -  23:23:16
12  -  23:23:48
13  -  23:24:05
14  -  23:24:38
15  -  23:24:54
16  -  23:25:45
17  -  23:26:24
18  -  23:26:54
19  -  23:27:36
20  -  23:28:21
21  -  23:29:20
22  -  23:30:13
23  -  23:30:29
24  -  23:32:38
25  -  23:33:32
26  -  23:36:11
27  -  23:36:36
28  -  23:38:57
29  -  23:38:57
30  -  23:39:25
31  -  23:40:53
32  -  23:41:26
33  -  23:42:36
34  -  23:43:48
35  -  23:44:40
36  -  23:45:16
37  -  23:46:03
38  -  23:46:50
39  -  23:47:57
40  -  23:48:49
41  -  23:49:38
42  -  23:51:38
43  -  23:52:42
44  -  23:53:24
45  -  23:54:18
46  -  23:55:07
47  -  23:56:05
48  -  23:56:59
49  -  23:57:46
50  -  23:58:21
51  -  23:59:19
52  -  00:00:10
53  -  00:01:14
54  -  00:02:25
55  -  00:03:23
56  -  00:03:55
57  -  00:04:49
58  -  00:05:12
59  -  00:06:03
60  -  00:06:47
61  -  00:07:25
62  -  00:08:24
63

489  -  06:06:02
490  -  06:07:03
491  -  06:08:02
492  -  06:09:07
493  -  06:10:48
494  -  06:11:24
495  -  06:12:11
496  -  06:13:24
497  -  06:14:37
498  -  06:15:57
499  -  06:17:16
500  -  06:18:10
501  -  06:19:22
502  -  06:20:14
503  -  06:20:26
504  -  06:22:07
505  -  06:22:56
506  -  06:23:34
507  -  06:24:44
508  -  06:25:43
509  -  06:26:56
510  -  06:27:59
511  -  06:29:05
512  -  06:30:07
513  -  06:30:42
514  -  06:31:31
515  -  06:32:08
516  -  06:32:38
517  -  06:32:52
518  -  06:33:28
519  -  06:33:58
520  -  06:36:19
521  -  06:37:31
522  -  06:38:04
523  -  06:39:02
524  -  06:39:44
525  -  06:40:05
526  -  06:40:09
527  -  06:41:05
528  -  06:41:48
529  -  06:42:17
530  -  06:43:37
531  -  06:44:14
532  -  06:45:12
533  -  06:45:47
534  -  06:46:43
535  -  06:47:39
536  -  06:48:14
537  -  06:49:05
538  -  06:50:15
539  -  06:50:45
540  -  06:51:19
541  -  06:52:36
542  -  06:53:32
543  -  06:54:04
544  -  06:55:10
545  -  06:57:02
546  -  06:57:13
547  -  06:58:

In [18]:
# Display the entry stored under key '0001158449'.
link_dict['0001158449']

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Link,Loc7,Loc8,Item7
52,0001158449-21-000036,2021-02-22,2021-01-02,2021-02-22T06:17:48.000Z,34,10-K,001-16797,21658174,,10380786,1,1,aap-20210102.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,"[[(92292, 92310), Item 7. Management], [(10599...","[[(135183, 135200), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
179,0001158449-20-000035,2020-02-18,2019-12-28,2020-02-18T16:33:41.000Z,34,10-K,001-16797,20625804,,15615027,1,1,aap10k12282019secreport.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,"[[(89201, 89219), Item 7. Management], [(99210...","[[(122041, 122058), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
278,0001158449-19-000043,2019-02-19,2018-12-29,2019-02-19T16:04:12.000Z,34,10-K,001-16797,19614955,,15674736,1,0,aap_10kx12292018secreport.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,"[[(66526, 66544), Item 7. Management]]","[[(103131, 103148), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
387,0001158449-18-000039,2018-02-21,2017-12-30,2018-02-21T16:17:10.000Z,34,10-K,001-16797,18628626,,15046063,1,0,aap_10kx12302017.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,"[[(64008, 64026), Item 7. Management]]","[[(103875, 103892), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
497,0001158449-17-000034,2017-02-28,2016-12-31,2017-02-28T17:08:31.000Z,34,10-K,001-16797,17649042,,17140639,1,0,aap_10kx12312016.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,"[[(84532, 84550), Item 7. Management]]","[[(152790, 152807), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
631,0001158449-16-000299,2016-03-01,2016-01-02,2016-03-01T17:29:32.000Z,34,10-K,001-16797,161474635,,17748770,1,0,aap_10kx122016.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,[],"[[(159224, 159241), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
751,0001158449-15-000063,2015-03-03,2015-01-03,2015-03-03T17:23:49.000Z,34,10-K,001-16797,15670482,,20329800,1,0,aap_10kx132015.htm,10-K 1 3 2014,https://www.sec.gov/Archives/edgar/data/115844...,[],"[[(158395, 158412), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...
879,0001158449-14-000058,2014-02-25,2013-12-28,2014-02-25T17:13:25.000Z,34,10-K,001-16797,14641709,,20347794,1,0,aap_10kx12282013.htm,10-K,https://www.sec.gov/Archives/edgar/data/115844...,[],"[[(155007, 155024), Item 8. Financial]]",Item 7. Management’s Discussion and Analysis o...


In [9]:
# Persist the current link_dict to 6-link_dict.pickle using the highest pickle protocol.
with open('6-link_dict.pickle', 'wb') as pickle_file:
    pickle.dump(link_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [467]:
# Position of the current `company` key within link_dict's key order.
list(link_dict).index(company)

765

In [None]:
# Reference URL kept as a comment: a bare URL in a code cell is a SyntaxError on Run All.
# https://www.sec.gov/Archives/edgar/data/835910/000083591001000012/0000835910-01-000012.txt

In [384]:
# For each filing of company '0000006201': render it to PDF, extract the text,
# and print how many lines start with the exact (upper-case) prefix 'ITEM'.
for link in link_dict['0000006201']['Link'].values:
    pdfkit.from_url(link, 'out.pdf')
    rawText = parser.from_file('out.pdf')
    rawList = rawText['content'].splitlines()
    result = [line for line in rawList if line.startswith('ITEM')]
    print(len(result))

Loading pages (1/6)
Printing pages (6/6)


22
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                          
22
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                          
21
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                        

In [None]:
# Download the SEC submissions document for `cik` and decode its JSON body.
# NOTE(review): `cik` and `heads` are not defined anywhere in the visible
# notebook — they must be set before this cell can run.
response = json.loads(requests.get("https://data.sec.gov/submissions/CIK"+cik+".json", headers=heads).text)

In [1]:
# Collect every company that has at least one empty 'Item7' entry.
# NOTE(review): `companies` is not defined in the visible notebook (the recorded
# run failed with NameError) — define it before running this cell.
badslist = []
for company in companies:
    item7_lengths = [len(text) for text in link_dict[company]['Item7'].values]
    if 0 in item7_lengths:
        badslist.append(company)

NameError: name 'companies' is not defined

In [8]:
# Every link_dict key that was not flagged in badslist.
goodslist = [key for key in link_dict if key not in badslist]

NameError: name 'badslist' is not defined

In [27]:
# link_dict key of the single company re-processed in the next cell.
company = '0001158449'

In [28]:
# Re-run the section extraction for the single company selected above.
# NOTE(review): relies on `counter` and `options` left over from the
# all-companies cell; they are not defined in this cell.
now = datetime.now()
print(counter, ' - ', now.strftime("%H:%M:%S"))

# Heading patterns do not change between filings, so compile them once.
item1_heading = re.compile(r'^ITEM.*1[^A][^A].*BUSINESS', re.IGNORECASE)
item1a_heading = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
item1b_heading = re.compile(r'^ITEM.*1(B| B)', re.IGNORECASE)
item7_heading = re.compile(r'^ITEM.*7[^A][^A]', re.IGNORECASE)
item8_heading = re.compile(r'^ITEM.*8[^B][^B]', re.IGNORECASE)

item1_company = []
item1a_company = []
item7_company = []

# (output list, heading that starts the section, heading that ends it)
section_specs = [
    (item1_company, item1_heading, item1a_heading),
    (item1a_company, item1a_heading, item1b_heading),
    (item7_company, item7_heading, item8_heading),
]

for link in link_dict[company]['Link'].values:
    try:
        pdfkit.from_url(link, 'out.pdf', options=options)
    except Exception:
        # Best effort: a filing that cannot be rendered gets empty sections.
        # `Exception` (not a bare except) keeps the kernel interruptible.
        for texts, _, _ in section_specs:
            texts.append('')
        continue

    rawText = parser.from_file('out.pdf')
    # Guard against a parse result with no text content (presumably possible
    # for a PDF without extractable text — confirm against the tika docs).
    rawList = (rawText['content'] or '').splitlines()
    # Kept for inspection in the following cell: lines starting with 'item' in any case.
    result = [x for x in rawList if re.match('ITEM', x[:4], re.IGNORECASE)]

    for texts, start_heading, end_heading in section_specs:
        # Collect line positions directly: looking the heading text up with
        # list.index() would return its FIRST occurrence, not the last match.
        start_hits = [pos for pos, line in enumerate(rawList) if start_heading.search(line)]
        end_hits = [pos for pos, line in enumerate(rawList) if end_heading.search(line)]

        if start_hits and end_hits:
            texts.append(' '.join(rawList[start_hits[-1]:end_hits[-1]]).replace('\t', ' '))
        else:
            # Either heading missing: keep the list aligned with the filings.
            texts.append('')

link_dict[company]['Item1'] = item1_company
link_dict[company]['Item1a'] = item1a_company
link_dict[company]['Item7'] = item7_company

23  -  20:51:26


ValueError: Length of values (1) does not match length of index (8)

result

In [30]:
# Display the 'ITEM'-prefixed lines (any case) left in `result` by the cell above.
result

['Item\t1.\t\t\t\tBusiness.',
 'Item\t1A.\tRisk\tFactors.',
 'Item\t1B.\tUnresolved\tStaff\tComments.',
 'Item\t2.\tProperties.',
 'Item\t3.\tLegal\tProceedings.',
 'Item\t4.\tMine\tSafety\tDisclosures.',
 'Item\t5. Market\t for\t Registrant’s\t Common\t Equity,\t Related\t Stockholder\t Matters\t and\t Issuer\t Purchases\t of\t Equity',
 'Item\t6.\t\t\t\tSelected\tConsolidated\tFinancial\tData.',
 'Item\t7.\tManagement’s\tDiscussion\tand\tAnalysis\tof\tFinancial\tCondition\tand\tResults\tof\tOperations.',
 'Item\t7A.\tQuantitative\tand\tQualitative\tDisclosures\tabout\tMarket\tRisks.',
 'Item\t8.\tFinancial\tStatements\tand\tSupplementary\tData.',
 'Item\t9.\tChanges\tin\tand\tDisagreements\twith\tAccountants\ton\tAccounting\tand\tFinancial\tDisclosure.',
 'Item\t9A.\tControls\tand\tProcedures.',
 'Item\t9B.\tOther\tInformation.',
 'Item\t10.\tDirectors,\tExecutive\tOfficers\tand\tCorporate\tGovernance.',
 'Item\t11.\tExecutive\tCompensation.',
 'Item\t12.\tSecurity\tOwnership\tof\tCe

In [38]:
# Locate Item 1 with a relaxed pattern and append the text up to Item 1A.
# Line positions are collected directly: rawList.index(text) returns the FIRST
# line carrying that text, which is wrong when a heading's text occurs more
# than once. Both lookups still raise IndexError when nothing matches.

##### ITEM 1
matches = re.compile(r'^ITEM.*1.*BUSINESS', re.IGNORECASE)
idx1 = [pos for pos, line in enumerate(rawList) if matches.search(line)][-1]

##### ITEM 1A
matches = re.compile(r'^ITEM.*1(A| A)', re.IGNORECASE)
idx1a = [pos for pos, line in enumerate(rawList) if matches.search(line)][-1]

item1_company.append(' '.join(rawList[idx1:idx1a]).replace('\t', ' '))

In [40]:
# Display the line index found for the Item 1 heading in the cell above.
idx1

203