# Document Parsing

In this notebook, we iterate through all the links created in file 2.2 (i.e. `2-link_dict.pickle`), extract the necessary information, and save an updated dictionary, which will be analyzed in section 3.

In [64]:
import numpy as np
import pandas as pd
import os
import pickle
import time
import json
import requests
import re

from bs4 import BeautifulSoup

In [4]:
# Load the link dictionary written by the previous step.
# pickle.load can execute arbitrary code contained in the file, so only load
# pickles that were produced by this project.
link_dict_path = '../Step1-Data/2-link_dict.pickle'
with open(link_dict_path, 'rb') as pickle_file:
    link_dict = pickle.load(pickle_file)

In [11]:
# Sanity check: show the first five keys of the dictionary.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
list(link_dict)[:5]

['0000006201', '0001158449', '0000320193', '0001551152', '0001140859']

# Single Name Sample

First, we parse the documents of a single name, then extend the same approach to all names.

In [12]:
# Pick one entry of the dictionary to prototype the parsing on.
sample_key = '0000006201'
sample = link_dict[sample_key]
sample

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Link
49,0000006201-21-000014,2021-02-17,2020-12-31,2021-02-17T17:17:57.000Z,34,10-K,001-08400,21646186,,43925703,1,1,aal-20201231.htm,10-K 2020 02.17.21,https://www.sec.gov/Archives/edgar/data/6201/0...
150,0000006201-20-000023,2020-02-19,2019-12-31,2020-02-19T07:31:30.000Z,34,10-K,001-08400,20627428,,30851334,1,1,a10k123119.htm,10-K 2019 02.19.20,https://www.sec.gov/Archives/edgar/data/6201/0...
225,0000006201-19-000009,2019-02-25,2018-12-31,2019-02-25T07:31:34.000Z,34,10-K,001-08400,19628071,,30572408,1,0,a10k123118.htm,10-K 2018 02.25.19,https://www.sec.gov/Archives/edgar/data/6201/0...
315,0000006201-18-000009,2018-02-21,2017-12-31,2018-02-21T08:02:40.000Z,34,10-K,001-08400,18627088,,27914491,1,0,a10k123117.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...
412,0001193125-17-051216,2017-02-22,2016-12-31,2017-02-22T08:01:43.000Z,34,10-K,001-08400,17627073,,24888480,1,0,d286458d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...
538,0001193125-16-474605,2016-02-24,2015-12-31,2016-02-24T08:04:10.000Z,34,10-K,001-08400,161450518,,26170400,1,0,d78287d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...
651,0001193125-15-061145,2015-02-25,2014-12-31,2015-02-25T08:02:34.000Z,34,10-K,001-08400,15645918,,39524925,1,0,d829913d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...
750,0000006201-14-000004,2014-02-28,2013-12-31,2014-02-28T07:52:16.000Z,34,10-K,001-08400,14651496,,47888955,1,0,aagaa10k-20131231.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...


In [271]:
# Take the link in the first row of the sample table as the test document.
sample_links = sample['Link'].values
sample_link = sample_links[0]
sample_link

'https://www.sec.gov/Archives/edgar/data/1555280/000155528021000098/zts-20201231.htm'

In [279]:
# Keep only the rows whose reportDate compares greater than '1996-01-01'.
# NOTE(review): this is a string comparison, so it assumes reportDate holds
# ISO yyyy-mm-dd strings — confirm.
is_after_cutoff = sample['reportDate'].values > '1996-01-01'
sample[is_after_cutoff]

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Link,Loc7,Loc8
90,0001555280-21-000098,2021-02-16,2020-12-31,2021-02-16T14:24:47.000Z,34,10-K,001-35797,21636290,,18961290,1,1,zts-20201231.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc269c1bcd0>,<callable_iterator object at 0x7fc27d64d350>
277,0001555280-20-000054,2020-02-13,2019-12-31,2020-02-13T15:13:44.000Z,34,10-K,001-35797,20609808,,21406402,1,1,zoetis-20191231x10kye.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc34926f410>,<callable_iterator object at 0x7fc269c25590>
428,0001555280-19-000041,2019-02-14,2018-12-31,2019-02-14T17:08:33.000Z,34,10-K,001-35797,19608112,,19494677,1,0,zoetis-20181231x10kye.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc346aaa450>,<callable_iterator object at 0x7fc34926fdd0>
572,0001555280-18-000053,2018-02-15,2017-12-31,2018-02-15T12:31:15.000Z,34,10-K,001-35797,18616883,,18267949,1,0,zoetis-20171231x10kye.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc2ae834410>,<callable_iterator object at 0x7fc346aaa850>
683,0001555280-17-000044,2017-02-16,2016-12-31,2017-02-16T16:26:45.000Z,34,10-K,001-35797,17618235,,18284297,1,0,zoetis-20161231x10kye.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc2db85e3d0>,<callable_iterator object at 0x7fc2ae834d90>
783,0001555280-16-000344,2016-02-24,2015-12-31,2016-02-24T11:32:51.000Z,34,10-K,001-35797,161450905,,19416727,1,0,zoetis-20151231x10kye.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc3601c3b10>,<callable_iterator object at 0x7fc2db85ee10>
927,0001555280-15-000057,2015-02-27,2014-12-31,2015-02-27T16:52:38.000Z,34,10-K,001-35797,15660089,,24724525,1,0,zoetis-20141231x10kye.htm,10-K,https://www.sec.gov/Archives/edgar/data/155528...,<callable_iterator object at 0x7fc2ee317f90>,<callable_iterator object at 0x7fc3601c3490>


In [26]:
# HTTP headers attached to every request made in this notebook.
# NOTE(review): SEC's fair-access guidance asks automated clients to declare a
# User-Agent that identifies the requester — confirm that a browser string is
# acceptable here.
heads = {}
heads['Accept'] = 'application/json'
heads['X-Requested-With'] = 'XMLHttpRequest'
heads['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'

In [54]:
# Download the sample filing.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status: fail loudly on an HTTP error instead of silently parsing
#   an error page as if it were the filing.
sample_response = requests.get(sample_link, headers=heads, timeout=30)
sample_response.raise_for_status()
response = sample_response.text

In [55]:
# Peek at the first 500 characters of the downloaded text.
response[:500]

'<?xml version="1.0" ?><!--XBRL Document Created with Wdesk from Workiva--><!--Copyright 2021 Workiva--><!--r:e284921f-ba97-4934-aa95-4c28f5345f62,g:0d6c1504-a6c8-4292-b9c4-47751639beae,d:218984ca4fa54d3589b92be28c18f351--><html xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:srt="http://fasb.org/srt/2020-01-31" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/1999/xhtml" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xml'

In [58]:
# Parse the downloaded text with BeautifulSoup, using the 'lxml' parser.
soup = BeautifulSoup(response, 'lxml')

In [148]:
# Text content of the parsed document; this is the string the Item 7 / Item 8
# regex searches run against.
cleaned_soup = soup.get_text()

In [149]:
# String form of the whole parsed document (as opposed to `soup.text`).
# NOTE(review): not referenced again in the visible part of the notebook.
raw_10k = str(soup)

In [270]:
# Find every "Item 7[.] Management" occurrence in the extracted text and show
# each match as [span, matched text].
# `\s` is used before "Management" (instead of a literal space) so that
# non-breaking spaces (\xa0), which appear in these filings, are matched too;
# this is the same pattern the full crawl loop uses.
locs7 = re.finditer(r"ITEM\s7(|\.)\sManagement", cleaned_soup, re.IGNORECASE)
[[x.span(), x.group()] for x in locs7]

[<re.Match object; span=(183890, 183908), match='Item 7. Management'>,
 <re.Match object; span=(185116, 185134), match='Item 7. Management'>,
 <re.Match object; span=(186022, 186040), match='Item 7. Management'>,
 <re.Match object; span=(186422, 186440), match='Item 7. Management'>,
 <re.Match object; span=(187030, 187048), match='Item 7. Management'>]

In [103]:
# Find every "Item 8[A][.] Consolidated|Financial" occurrence in the extracted
# text and materialise the match objects for display.
locs8 = re.finditer(r"ITEM\s8(|A)(|\.)\s(Consolidated|Financial)", cleaned_soup, re.IGNORECASE)
list(locs8)

[<re.Match object; span=(502641, 502662), match='ITEM\xa08A.\xa0CONSOLIDATED'>]

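Each pattern can match at several locations within one filing (see the five Item 7 matches above), so we still need a way to pick the correct one. Open question: would some sort of clustering algorithm be the best way to find the correct locations?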

In [280]:
# Crawl every filing link and record where the Item 7 / Item 8 headings occur.
#
# For each key in `link_dict`, the rows with reportDate > '1996-01-01' are kept
# and two columns are added:
#   Loc7 - list of [span, matched text] for every Item 7 match in the filing
#   Loc8 - list of [span, matched text] for every Item 8 match in the filing
# The result is stored in `link_dict3[key]`.
#
# This issues one HTTP request per filing, so a full run takes hours. A single
# failed download must therefore not abort the crawl: failures are recorded in
# `failed_links` and the filing gets empty match lists.

REQUEST_TIMEOUT_SECONDS = 30  # never wait forever on a stalled connection

# Compile once instead of on every iteration.
ITEM7_PATTERN = re.compile(r"ITEM\s7(|\.)\sManagement", re.IGNORECASE)
ITEM8_PATTERN = re.compile(r"ITEM\s8(|A)(|\.)\s(Consolidated|Financial)", re.IGNORECASE)

start = time.time()
link_dict3 = {}
failed_links = []  # (key, link, error) for every filing that could not be fetched

# A Session reuses the connection to the host across requests.
with requests.Session() as session:
    session.headers.update(heads)

    for key, all_filings in link_dict.items():
        # Copy AFTER filtering so the new columns are assigned on an
        # independent frame, never on a slice of the original.
        filings = all_filings[all_filings['reportDate'].values > '1996-01-01'].copy()

        loc7list = []
        loc8list = []

        for filing_link in filings['Link']:
            try:
                filing_response = session.get(filing_link, timeout=REQUEST_TIMEOUT_SECONDS)
                filing_response.raise_for_status()
            except requests.RequestException as err:
                # Keep the lists aligned with the rows and carry on.
                failed_links.append((key, filing_link, repr(err)))
                loc7list.append([])
                loc8list.append([])
                continue

            filing_text = BeautifulSoup(filing_response.text, 'lxml').text
            loc7list.append([[m.span(), m.group()] for m in ITEM7_PATTERN.finditer(filing_text)])
            loc8list.append([[m.span(), m.group()] for m in ITEM8_PATTERN.finditer(filing_text)])

        filings['Loc7'] = loc7list
        filings['Loc8'] = loc8list

        link_dict3[key] = filings

end = time.time()

if failed_links:
    print(f"{len(failed_links)} filing(s) could not be fetched; see failed_links")

In [284]:
# Persist the enriched dictionary (with the Loc7 / Loc8 columns).
# NOTE(review): this writes to the current working directory, whereas the input
# with the same file name was read from ../Step1-Data/ — confirm this is the
# intended output location.
output_path = '2-link_dict.pickle'
with open(output_path, 'wb') as out_file:
    pickle.dump(link_dict3, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [282]:
# Wall-clock duration of the crawl, in seconds.
elapsed_seconds = end - start
print(elapsed_seconds)

13508.377779006958


In [283]:
# Number of keys that made it into the result dictionary.
len(link_dict3)

873

In [214]:
# Scale the measured duration by 1110 / 5 and convert seconds to hours.
# NOTE(review): presumably a timing taken on 5 keys extrapolated to 1110 keys;
# it is only meaningful if `start`/`end` come from such a 5-key trial run —
# confirm, or remove this cell.
trial_seconds = end - start
trial_seconds * 1110 / 5 / 60 / 60

6.319669485092163

In [286]:
# Show the Item 7 matches recorded for the first filing of one key.
list(link_dict3['0001158449']['Loc7'].values[0])

[[(92292, 92310), 'Item 7. Management'],
 [(105990, 106008), 'Item 7. Management']]

# Find Closest Pair of Points

