# Objective
To discover insights and relationships from insider trading filings

In [None]:
# Essential imports and constants
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from bs4 import BeautifulSoup as bs
import requests, time, datetime, re

# if executed locally
TEST = 'Rationale/test.csv'
FORM4_LINK = 'Database/form4_data.csv'
FORM4_LINK_XML = 'Database/form4_data_xml.csv'
FORM4_LINK_HTM = 'Database/form4_data_htm.csv'
FORM4_LINK_TXT = 'Database/form4_data_txt.csv'
FORM4_DETAIL = 'Database/form4_detail.csv' 
FORM4_DETAIL_XML = 'Database/form4_detail_xml.csv'
FORM4_DETAIL_HTM = 'Database/form4_detail_htm.csv'
FORM4_DETAIL_TXT = 'Database/form4_detail_txt.csv'
FORM4_TX = 'Database/form4_tx.csv'
FORM4_TX_XML = 'Database/form4_tx_xml.csv'
FORM4_TX_HTM = 'Database/form4_tx_htm.csv'
FORM4_TX_TXT = 'Database/form4_tx_txt.csv'
META_DJ30 = 'Metadata/ticker_dj30.txt' 
META_SP500 = 'Metadata/ticker_sp500.txt'

In [None]:
# if executed on Google Colab
FORM4_LINK = '/content/drive/My Drive/URECA/Database/form4_data.csv'
FORM4_LINK_XML = '/content/drive/My Drive/URECA/Database/form4_data_xml.csv'
FORM4_LINK_HTM = '/content/drive/My Drive/URECA/Database/form4_data_htm.csv'
FORM4_LINK_TXT = '/content/drive/My Drive/URECA/Database/form4_data_txt.csv'
FORM4_DETAIL = '/content/drive/My Drive/URECA/Database/form4_detail.csv'
FORM4_DETAIL_XML = '/content/drive/My Drive/URECA/Database/form4_detail_xml.csv'
FORM4_DETAIL_HTM = '/content/drive/My Drive/URECA/Database/form4_detail_htm.csv'
FORM4_DETAIL_TXT = '/content/drive/My Drive/URECA/Database/form4_detail_txt.csv'
FORM4_TX = '/content/drive/My Drive/URECA/Database/form4_tx.csv'
FORM4_TX_XML = '/content/drive/My Drive/URECA/Database/form4_tx_xml.csv'
FORM4_TX_HTM = '/content/drive/My Drive/URECA/Database/form4_tx_htm.csv'
FORM4_TX_TXT = '/content/drive/My Drive/URECA/Database/form4_tx_txt.csv'
META_DJ30 = '/content/drive/My Drive/URECA/Metadata/ticker_dj30.txt' 
META_SP500 = '/content/drive/My Drive/URECA/Metadata/ticker_sp500.txt'
TEST = '/content/drive/My Drive/URECA/Rationale/test.csv'

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# snippet to download files
from google.colab import files
files.download('form4.detail.csv')  # example

ModuleNotFoundError: No module named 'google.colab'

In [None]:
%%bash
# and save in Google Drive
cp -r form4_detail.csv '/content/drive/My Drive/URECA'
cp -r tee_head_time.csv '/content/drive/My Drive/URECA'
cp -r test.csv '/content/drive/My Drive/URECA'

# Methodology

## Rationale for Using GNU Parallel
Experimenting with GNU Parallel. For SEC EDGAR file, only 10 requests per second are allowed. To test GNU Parallel, https://httpbin.org/#/Dynamic_data is used. 10 API calls that will delay 5 seconds are launched by curl in parallel. The results are obtained in 6s, which is a good estimate that GNU parallel is functioning.

In [None]:
%%bash
seq 10 | time parallel -j10 'curl -X GET "https://httpbin.org/delay/5" -H "accept: application/json"'

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "application/json", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/7.68.0", 
    "X-Amzn-Trace-Id": "Root=1-60365028-2c22436a65bb09cd017d013f"
  }, 
  "origin": "155.69.175.63", 
  "url": "https://httpbin.org/delay/5"
}
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "application/json", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/7.68.0", 
    "X-Amzn-Trace-Id": "Root=1-60365028-6b55be27021de3d366bd4b5c"
  }, 
  "origin": "155.69.175.63", 
  "url": "https://httpbin.org/delay/5"
}
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "application/json", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/7.68.0", 
    "X-Amzn-Trace-Id": "Root=1-60365028-5e141fe6713b019911066cc7"
  }, 
  "origin": "155.69.175.63", 
  "url": "https://httpbin.org/delay/5"
}
{
  "args": {}, 
  "data": "", 
  "files": {},

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:05 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:06 --:--:--     0100   318  100   318    0     0     52      0  0:00:06  0:00:06 --:--:--    78
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:-

CalledProcessError: Command 'b'seq 10 | time parallel -j10 \'curl -X GET "https://httpbin.org/delay/5" -H "accept: application/json"\'\n'' returned non-zero exit status 10.

As an extension, API calls that will have delays from 1 to 15 seconds are made. With GNU Parallel, curl obtains the data in 18 seconds, showing that it does exhibit parallel behaviour.

In [None]:
%%bash
seq 15 | time parallel -j10 'curl -X GET "https://httpbin.org/delay/{}" -H "accept: application/json" -N'

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "application/json", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/7.68.0", 
    "X-Amzn-Trace-Id": "Root=1-60364d76-294f327a183efe5e4fddec92"
  }, 
  "origin": "155.69.175.63", 
  "url": "https://httpbin.org/delay/1"
}
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "application/json", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/7.68.0", 
    "X-Amzn-Trace-Id": "Root=1-60364d76-751ac2a11a21f0ec69fdf1b4"
  }, 
  "origin": "155.69.175.63", 
  "url": "https://httpbin.org/delay/2"
}
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "application/json", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/7.68.0", 
    "X-Amzn-Trace-Id": "Root=1-60364d76-5c87682d567d70e5040f4282"
  }, 
  "origin": "155.69.175.63", 
  "url": "https://httpbin.org/delay/3"
}
{
  "args": {}, 
  "data": "", 
  "files": {},

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0100   318  100   318    0     0    155      0  0:00:02  0:00:02 --:--:--   155
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0100   318  100   318    0     0    106      0  0:00:03  0:00:02  0:00:01   106100   318  100   318    0     0    105      0  0:0

`curl` can be configured to not buffer the outputs while the execution is ongoing with `-N` option

In [None]:
%%bash
curl -X GET "https://httpbin.org/drip?duration=2&numbytes=10&code=200&delay=2" -H "accept: application/octet-stream" -N

**********

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0 10    10    0     1    0     0      0      0 --:--:--  0:00:03 --:--:--     0 40    10   40     4    0     0      1      0  0:00:10  0:00:03  0:00:07     1 90    10   90     9    0     0      1      0  0:00:10  0:00:04  0:00:06     1100    10  100    10    0     0      2      0  0:00:05  0:00:04  0:00:01     2


## Rationale for Using Scripting
In handling Form 4 files from 1993 to 2021, files will be at least 2GB. However, not all data are needed for data processing. To extract the right data for analysis, Linux script is used. 
  
The typical way of handling data would be to read in the data files and process them by using Pandas. However, this process may not be suitable for large files due to memory and time complexities (paging will create bottleneck in bringing into the physical memory)

The script below generates a csv file with 2.55GB data, where there are 23 columns and 50 million rows of data. Each row of data is 51 bytes. The file size is the typical file size of all Form 4 files in SEC EDGAR

In [None]:
%%bash
# Add header for easier reference
echo "digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,digit_7,digit_8,mod4,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14" | tee test.csv
for i in $(seq 1 1 50000000); do 
    tmp=$(($i%10));
    echo $tmp,$(($(($i*2))%10)),$tmp,$tmp,$tmp,$tmp,$(($tmp % 3)),$(($tmp % 5)),$tmp$tmp,$tmp,$tmp,$tmp,$(($tmp % 4)),$tmp$tmp,$tmp,$tmp,$tmp,$(($tmp % 7)),$tmp$tmp$tmp,$tmp$tmp,$tmp,$tmp,$(($tmp % 4)); 
done | tee -a test.csv | awk 'FNR < 15'

# using awk instead of head as it is asynchronous [https://unix.stackexchange.com/questions/47932/how-do-i-use-tee-to-redirect-to-grep]
# command for awk [https://www.unix.com/shell-programming-and-scripting/82416-printing-first-n-lines-file-without-using-head.html]

digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,digit_7,digit_8,mod4,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14
1,2,1,1,1,1,1,1,11,1,1,1,1,11,1,1,1,1,111,11,1,1,1
2,4,2,2,2,2,2,2,22,2,2,2,2,22,2,2,2,2,222,22,2,2,2
3,6,3,3,3,3,0,3,33,3,3,3,3,33,3,3,3,3,333,33,3,3,3
4,8,4,4,4,4,1,4,44,4,4,4,0,44,4,4,4,4,444,44,4,4,0
5,0,5,5,5,5,2,0,55,5,5,5,1,55,5,5,5,5,555,55,5,5,1
6,2,6,6,6,6,0,1,66,6,6,6,2,66,6,6,6,6,666,66,6,6,2
7,4,7,7,7,7,1,2,77,7,7,7,3,77,7,7,7,0,777,77,7,7,3
8,6,8,8,8,8,2,3,88,8,8,8,0,88,8,8,8,1,888,88,8,8,0
9,8,9,9,9,9,0,4,99,9,9,9,1,99,9,9,9,2,999,99,9,9,1
0,0,0,0,0,0,0,0,00,0,0,0,0,00,0,0,0,0,000,00,0,0,0
1,2,1,1,1,1,1,1,11,1,1,1,1,11,1,1,1,1,111,11,1,1,1
2,4,2,2,2,2,2,2,22,2,2,2,2,22,2,2,2,2,222,22,2,2,2
3,6,3,3,3,3,0,3,33,3,3,3,3,33,3,3,3,3,333,33,3,3,3
4,8,4,4,4,4,1,4,44,4,4,4,0,44,4,4,4,4,444,44,4,4,0


Consider loading the whole dataset into csv file just to get those rows with 'triple' = 111

For Pandas, the naive approach would be to load the whole dataset into a variable `df`, and then use Boolean operations and masking to obtain the required data.

In [None]:
%%time
data = pd.read_csv(TEST, header = 0)
df = data.loc[data['triple']==111, :]
display(df)

MemoryError: Unable to allocate 8.57 GiB for an array with shape (23, 50000000) and data type int64

`MemoryError` occured as the physical memory on the local machine is insufficient for the whole dataset. An approach to circumvent this issue is to load the file in chunks or to use appropriate data types to add

In [None]:
a = datetime.datetime.now()
df_full = pd.DataFrame()
for chunk in pd.read_csv(TEST, header = 0, chunksize = 100000):
    df = chunk.loc[(chunk['triple'] == 111), :]
    df_full = pd.concat([df_full, df], axis = 0)
b = datetime.datetime.now()
display(df_full)
print("Total Time Taken:", (b-a).total_seconds())

Unnamed: 0,digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,...,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14
0,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
10,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
20,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
30,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
40,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49999950,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
49999960,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
49999970,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
49999980,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1


Total Time Taken: 131.005842


To process 2.5GB of data, Pandas took approximately 2 minutes 10 seconds (131.005842 seconds). Consider using bash script with awk, which streams and processes data line by line

In [None]:
%%bash
time awk 'BEGIN {FS=","} {if (($19 == 111))
    print $0;
}' 'Rationale/test.csv' >buffer.csv


real	0m45.338s
user	0m42.406s
sys	0m2.859s


In [None]:
a = datetime.datetime.now()
df_all_bash = pd.read_csv('buffer.csv', header = 0)
display(df_all_bash)
b = datetime.datetime.now()
print((b-a).total_seconds())

Unnamed: 0,1,2,1.1,1.2,1.3,1.4,1.5,1.6,11,1.7,...,11.1,1.11,1.12,1.13,1.14,111,11.2,1.15,1.16,1.17
0,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
1,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
2,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
3,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999994,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4999995,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4999996,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4999997,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1


6.468974


Adding the processing and loading time of the script approach yields less than 55 seconds, compared with 131 seconds by using Pandas approach. The script approach takes less than 45% of the time taken by Pandas approach.

This quick example illustrates the need for pre-filtering of data instead of loading the whole dataset into the physical memory. Although there are other variables that influence the ability to load DataFrames (eg. data types used to store each datum, number of processors on the machine), this example is intended to just explain the rationale of using scripting to filter data

An alternative method is to store the data in SQL database and filter the necessary data before reading them. The following example shows this approach by using MySQL

In [None]:
%%time
engine = create_engine('mysql+pymysql://{}:{}@localhost/ureca'.format(os.environ(MySQLPass)))
dtypes = {'digit_1': 'int8','mul2': 'int8',
'digit_2': 'int8', 'digit_3': 'int8', 'digit_4': 'int8', 'digit_5': 'int8',
'mod3': 'int8', 'mod5': 'int8', 'doubledigit_1': 'int8', 'digit_6': 'int8', 'digit_7': 'int8', 'digit_8': 'int8',
'mod4': 'int8', 'doubledigit_2': 'int8', 'digit_9':'int8', 'digit_10': 'int8', 'digit_11': 'int8', 'mod7': 'int8',
'triple': 'int8', 'doubledigit_3': 'int8', 'digit12': 'int8', 'digit13': 'int8', 'digit14': 'int8'}

for chunk in pd.read_csv('Rationale/test.csv', header = 0, chunksize = 1000000, dtype = dtypes):
    chunk.to_sql('rationale_use_script', engine, index=False, if_exists = 'append')

Wall time: 1h 17min 26s


In [None]:
# Verify that all data are transferred
query = """
SELECT COUNT(*)
FROM rationale_use_script;
"""
count = pd.read_sql_query(query, engine)
print(count)

   COUNT(*)
0  50000000


In [None]:
import datetime
a = datetime.datetime.now()
engine = create_engine('mysql+pymysql://oong:{}@localhost/ureca'.format(os.environ(MySQLPass)))
query = """
SELECT *
FROM rationale_use_script
WHERE triple=111;
"""
df_full_sql = pd.read_sql_query(query, engine)
display(df_full_sql)
b = datetime.datetime.now()
print("Total Time Taken:", (b-a).total_seconds())

Unnamed: 0,digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,...,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14
0,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
1,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
2,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
3,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4999996,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4999997,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1
4999998,1,2,1,1,1,1,1,1,11,1,...,11,1,1,1,1,111,11,1,1,1


Total Time Taken: 194.846809


Notice that the SQL approach takes approximately 3 minutes 15 seconds (194.846809 seconds). 
  
In summary, 

|Approach | Time (rounded to nearest sec)| Approximate Ratio
--- | --- | ---
CSV | 131 | 2.52
Script | 52 | 1
SQL | 195 | 3.75

# Data Collection
To obtain data, three major actions are taken:  
1. Determine the Approach to scrap the Form 4 files by obtaining them in XML format
2. Determine the target companies to explore and scrap
3. Scrap the information from the Form 4 files

## Scraping XML Links
Insider trading filings are kept in US Securities and Exchange Commission (SEC) in Electronic Data Gathering, Analysis, and Retrieval system (EDGAR), the primary system for companies and other entities to submit documents [1](https://www.sec.gov/edgar/about). Since all insider trading activities must be reported to SEC EDGAR via Form 4, insider trading analysis can be performed by scraping the website.
  
SEC EDGAR website compiles data in the following way:
- The Form 4 files since 1993 are stored in SEC EDGAR. 
- The Form 4 files are compiled into a folder each quarter of the year. To locate all of these files, EDGAR uses an index `idx` file to track the reporting companies, dates and links regarding each Form 4 file.
Hence, to quicken the data collection process, the index files will be scraped by substituting the year and quarter into the link, and obtaining the link by cURL utility. The website link has the pattern   
`https://www.sec.gov/Archives/edgar/full-index/[filing year]/QTR[filing quarter]/form.idx`
  
However, SEC EDGAR has a traffic limit of 10 requests per second. Exceeding traffic limit will cause IP block for 10 minutes. While the offered traffic limit is decent, it takes around 500 seconds to scrap just one quarter (if the quarter is the first or second quarter). Parallel execution is preferred.
  
Scraping of XML files is done in `extract.sh`. Note that a string parameter containing CIKs separated by '|' is passed into the file for filtering.

In [None]:
%%bash
cat extract.sh

#!/bin/sh
read CIKs;
for i in $(seq 1993 1 2020); 
do
    for j in $(seq 1 1 4);
    do
        idx_file="https://www.sec.gov/Archives/edgar/full-index/$i/QTR$j/form.idx"
        if curl -o /dev/null --silent --fail --head $idx_file; then
            echo "Reading idx file for $i-QTR$j"
            SECONDS=0;
            dir_date=$(echo $idx_file | sed -rne "s|.*([0-9]{4})/QTR([1-4]).*|\1-QTR\2|p")
            curl -s $idx_file | grep -E "^4[[:space:]]" | grep -Ew $CIKs |
            awk -v home_link="https://www.sec.gov/Archives/" 'BEGIN{OFS=":"; ORS="\n"}
                {for(i=2;i<NF-2;i++) printf("%s ", $(i))
                print "", $(NF-2), $(NF - 1), home_link$(NF)}' |
            while IFS=: read -r company cik date link; do
                sleep 0.1000
                xml_file=`curl -s $link | sed -ne '0,/<FILENAME>/s/<FILENAME>\(.*\)/\1/p' `
                temp=`echo $link | sed -e 's/-//g'`
                new_link=`echo $temp | sed -e "s|.txt|/$xml_file|"`
               

## Determine the Target Companies
This exploration focuses on Dow Jones 30 companies. 
- As each company has a corresponding Central Index Key (CIK) and a Ticker Symbol, [ticker data](https://www.sec.gov/include/ticker.txt) is obtained from SEC EDGAR to obtain information about these companies.
- To determine a clearer company domain to explore, Dow Jones 30 and S&P 500 companies are explored
    - Dow Jones 30 companies are obtained from [Wikipedia](https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average)
    - S&P 500 companies are obtained from [Wikipedia](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies)

In [None]:
# Obtain ticker data
ticker_cik = pd.read_table('ticker.txt', names = ['Ticker', 'CIK'])
ticker_cik.Ticker = ticker_cik.apply(lambda x: str(x.Ticker).upper(), axis = 1)
ticker_cik.set_index('Ticker', inplace = True)

### Dow Jones 30

In [None]:
dj_companies = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')[1]
dj_companies = dj_companies.loc[:,['Company', 'Symbol']].set_index('Symbol')

In [None]:
!curl -s https://www.sec.gov/include/ticker.txt >ticker.txt

In [None]:
dj_companies = dj_companies.join(ticker_cik, how = 'inner')
grep_pat = '|'.join(dj_companies.CIK.astype(str).to_list())
with open(META_DJ30, 'w') as f:
    f.write("%s" % grep_pat)
print(grep_pat)
dj_companies.head()

320193|318154|4962|12927|18230|1108524|858877|93410|1744489|1751788|886982|354950|773840|51143|50863|200406|19617|21344|63908|66740|310158|789019|320187|80424|86312|731766|1403161|732712|1618921|104169


Unnamed: 0,Company,CIK
AAPL,Apple Inc.,320193
AMGN,Amgen,318154
AXP,American Express,4962
BA,Boeing,12927
CAT,Caterpillar Inc.,18230


### S&P 500

In [None]:
sp_companies = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp_companies = sp_companies.loc[:, ['Symbol', 'Security']].set_index('Symbol')
sp_companies = sp_companies.merge(ticker_cik, left_index = True, right_index = True)

In [None]:
grep_pat = '|'.join(sp_companies.CIK.astype(str).to_list())
with open(META_SP500, 'w') as f:
    f.write("%s" % grep_pat)

When `extract.sh` is run for DJ30 companies, each file takes around 500 seconds (output can be seen on the console output). This might indicate that the scraping process of S&P500 companies to be even longer. Thus, Dow Jones 30 companies are explored initially. 

In [None]:
%%bash
cat 'Metadata/ticker_dj30.txt' | sh extract.sh

### Linking to 3.1: Shortening the Data Processing and Pipelining Process
GNU Parallel is used to improve the existing `extract.sh` before execution. However, as traffic limit must be considered, the total request-response time is measured. To measure this, the duration for a file is first measured. This approach and command is illustrated by [Joseph Scott](https://blog.josephscott.org/2011/10/14/timing-details-with-curl/):  
- Use the request-response information to obtain time-related measurements and output into `curl-format.txt`
- Calculate the time with `-w` option  
An example is shown below

In [None]:
%%bash 
curl -w "@curl-format.txt" -o /dev/null -s "https://www.sec.gov/Archives/edgar/data/320193/000032019320000101/wf-form4_160565582610158.xml"

     time_namelookup:  0.026844s
        time_connect:  0.033290s
     time_appconnect:  0.062047s
    time_pretransfer:  0.062207s
       time_redirect:  0.000000s
  time_starttransfer:  0.082111s
                     ----------
          time_total:  0.082261s


Assuming an upper limit of 0.1s for each call. Hence, within 1 second, 10 requests can be launched -- the **maximum** traffic limit of SEC EDGAR. GNU Parallel can be run, but it needs to be a delay for at least 0.92 seconds for each job. For easier coding, a delay of 1 second is used

In [None]:
%%bash
cat extract.sh # will cause traffic blockage because 10 parallel jobs imply at least 100 requests 
               # (10 times the traffic limit) unless with a delay

#!/bin/bash
read CIKs;

scrapDetailsFromLink() {
    xml_file=$(curl -s $4 | sed -ne '0,/<FILENAME>/s/<FILENAME>\(.*\)/\1/p');
    temp=$(echo $4 | sed -e 's/-//g');
    new_link=$(echo $temp | sed -e "s|.txt|/$xml_file|");
    printf "%s\b|%s|%s|%s\n" "$1" $2 $3 $new_link;
    sleep 1
}
export -f scrapDetailsFromLink

for i in $(seq 1993 1 2020); 
do
    for j in $(seq 1 1 4);
    do
        idx_file="https://www.sec.gov/Archives/edgar/full-index/$i/QTR$j/form.idx"
        if curl -o /dev/null --silent --fail --head $idx_file; then
            echo "Reading idx file for $i-QTR$j"
            SECONDS=0;
            dir_date=$(echo $idx_file | sed -rne "s|.*([0-9]{4})/QTR([1-4]).*|\1-QTR\2|p")
            curl -s $idx_file | grep -E "^4[[:space:]]" | grep -Ew $CIKs |
            awk -v home_link="https://www.sec.gov/Archives/" 'BEGIN{OFS="::"; ORS="\n"}
                {for(i=2;i<NF-2;i++) printf("%s ", $(i))
                print "", $(NF-2), $(NF - 1), home_link$(NF)}' |
            /

In [None]:
%%bash
cat 'Metadata/ticker_dj30.txt' | bash extract.sh

Reading idx file for 2010-QTR1
Runtime: 144
./trading_2010-QTR1.csv is appended into database document


In [None]:
%%bash
wc -l data_par.csv

1147 data_par.csv


$144/1147=0.12554\ seconds/link $  
$Number\ of\ links\ per\ second = 1147/144 = 7.97 requests\ per\ second$, still within the 10 requests per second limit

In [None]:
%%bash
cat extract_noPAR.sh

cat: extract_noPAR.sh: No such file or directory


In [None]:
%%bash
cat 'Metadata/ticker_dj30.txt' | bash extract_noPAR.sh

Reading idx file for 2010-QTR1
Runtime: 740
./trading_2010-QTR1.csv is appended into database document


In [None]:
%%bash
wc -l data.csv

1146 data.csv


$Period\ for\ one\ link = 740/1146 = 0.64572\ seconds/link$  
$Number\ of\ requests\ per\ second = 1146/740 = 1.5486\ requests\ per\ second$  
  
This implies a fivefold speedup when GNU Parallel is used appropriately

In [None]:
%%bash
cat 'ticker_dj30.txt' | bash extract.sh 

Reading idx file for 1993-QTR1
Reading idx file for 1993-QTR2
Reading idx file for 1993-QTR3
Reading idx file for 1993-QTR4
Reading idx file for 1994-QTR1
Reading idx file for 1994-QTR2
Reading idx file for 1994-QTR3
Reading idx file for 1994-QTR4
Reading idx file for 1995-QTR1
Reading idx file for 1995-QTR2
Reading idx file for 1995-QTR3
Reading idx file for 1995-QTR4
Reading idx file for 1996-QTR1
Reading idx file for 1996-QTR2
Reading idx file for 1996-QTR3
Reading idx file for 1996-QTR4
Reading idx file for 1997-QTR1
Reading idx file for 1997-QTR2
Reading idx file for 1997-QTR3
Reading idx file for 1997-QTR4
Reading idx file for 1998-QTR1
Reading idx file for 1998-QTR2
Reading idx file for 1998-QTR3
Reading idx file for 1998-QTR4
Reading idx file for 1999-QTR1
Reading idx file for 1999-QTR2
Reading idx file for 1999-QTR3
Reading idx file for 1999-QTR4
Reading idx file for 2000-QTR1
Reading idx file for 2000-QTR2
Reading idx file for 2000-QTR3
Reading idx file for 2000-QTR4
Reading 

## Extracting and Cleaning: Scraping Information from Form 4 Files

While all files that are labeled as Form 4 are extracted from SEC EDGAR, care must be taken that not all forms are in the same format.

As will be presented below, most Form 4 files are in XML format. The same structure allows automated extraction. However, the other files require a different approach. 

The code section below shows the total number of Form 4 files and the number of files that are not in XML format. Other files could be in `htm` or `txt` format. In addition, some of the links scraped do not even have a file extension because the script executed in the previous section identifies the `<FILENAME>` tag in the `txt` file in order to locate the XML or HTM file.

In [None]:
%%bash
wc -l '/content/drive/My Drive/URECA/form4_data.csv'    # number of Form 4 files harvested
cat '/content/drive/My Drive/URECA/form4_data.csv' | grep -vc "xml$"  # number of files not of xml

63439 /content/drive/My Drive/URECA/form4_data.csv
1336


Inspecting the links of these 2.25% of outlier files:

In [None]:
cat '/content/drive/My Drive/URECA/form4_data.csv' | grep -v "xml$"

Company|CIK|Date|XML Link
NIKE INC|320187|1996-06-11|https://www.sec.gov/Archives/edgar/data/320187/000090385596000004/
NIKE INC|320187|2001-11-05|https://www.sec.gov/Archives/edgar/data/320187/000090385501500006/edgar.txt
GOLDMAN SACHS GROUP INC/|886982|2001-12-10|https://www.sec.gov/Archives/edgar/data/886982/000076999301500239/kind40111kind.txt
AMERICAN EXPRESS CO|4962|2003-04-01|https://www.sec.gov/Archives/edgar/data/4962/000000496203000046/edgar.txt
AMERICAN EXPRESS CO|4962|2003-04-01|https://www.sec.gov/Archives/edgar/data/4962/000000496203000048/edgar.txt
AMERICAN EXPRESS CO|4962|2003-04-01|https://www.sec.gov/Archives/edgar/data/4962/000000496203000044/edgar.txt
AMERICAN EXPRESS CO|4962|2003-04-01|https://www.sec.gov/Archives/edgar/data/4962/000000496203000049/edgar.txt
AMERICAN EXPRESS CO|4962|2003-04-01|https://www.sec.gov/Archives/edgar/data/4962/000000496203000050/edgar.txt
AMERICAN EXPRESS CO|4962|2003-04-29|https://www.sec.gov/Archives/edgar/data/4962/0000004962

Since there are three file types, three approaches are needed to scrap data. In this notebook, more steps are illustrated for XML files. The extraction of the other two file types involves similar steps.

In [None]:
%%bash
echo "Company|CIK|Date|XML Link" >Database/form4_data_xml.csv
cat '/content/drive/My Drive/URECA/form4_data.csv' | grep "xml$" >>Database/form4_data_xml.csv # extract files with xml file types

In [None]:
data = pd.read_csv(FORM4_LINK_XML, sep='|')

### Scrap Information from Each Form 
Form 4 files are stored in three formats: HTM, XML and txt files.  

This research scraps the data from XML files with the help of Beautiful Soup. To begin with, the important attributes to be scraped are identified. An example of the XML file is used to explore the attributes

In [None]:
content = requests.get("https://www.sec.gov/Archives/edgar/data/318154/000118143104004871/rrd30540.xml").content
bs_content = bs(content, 'lxml')
print(bs_content.prettify())

<?xml version="1.0"?>
<html>
 <body>
  <ownershipdocument>
   <schemaversion>
    X0201
   </schemaversion>
   <documenttype>
    4
   </documenttype>
   <periodofreport>
    2004-01-27
   </periodofreport>
   <notsubjecttosection16>
    0
   </notsubjecttosection16>
   <issuer>
    <issuercik>
     0000318154
    </issuercik>
    <issuername>
     AMGEN INC
    </issuername>
    <issuertradingsymbol>
     AMGN
    </issuertradingsymbol>
   </issuer>
   <reportingowner>
    <reportingownerid>
     <rptownercik>
      0000904017
     </rptownercik>
     <rptownername>
      JOHNSON FRANKLIN P JR
     </rptownername>
    </reportingownerid>
    <reportingowneraddress>
     <rptownerstreet1>
      ONE AMGEN CENTER DRIVE
     </rptownerstreet1>
     <rptownerstreet2>
     </rptownerstreet2>
     <rptownercity>
      THOUSAND OAKS
     </rptownercity>
     <rptownerstate>
      CA
     </rptownerstate>
     <rptownerzipcode>
      91320-1799
     </rptownerzipcode>
     <rptownerstatedescri

A few observations are made:
- Each issuer (the reporting firm or representative) can file many Form 4
- Each reporter can file many Form 4, but the number of shares owned after the transaction is related to the reporter, not the issuer
- Each Form 4 is labeled with **an** Accession Number, which is composed of [three parts](https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm):
    - CIK of the firm 
    - last two digits of the filing year
    - sequential count of submitted filings from that firm that year
- **Each** Form 4 file can have **multiple** transactions. This research will only focus on non-derivative transactions.
- This implies that each file has a unique Accession Number, which can assist in merging tables for exploratory analysis.

Firstly, the main table that details a Form 4 file for each entry is built. This file is stored in *form4_detail.csv*. Useful information includes

| Detail | Variable in DataFrame |
| :--- | :--- |
Trading Symbol | `trad_symbol`
Accession Number | `accession_no`
Form Number (to double-check if it is Form 4) | `form`
Reporter's name | `reporter_name`
Reporter's CIK | `reporter_cik`
Reporter's Title in the Company | `reporter_title`
Whether Trader is a Director | `is_director`
Whether Trader is an Officer | `is_officer`
Whether Trader is a Beneficial Owner (i.e. owns at least 10% of the company shares) | `is_beneficial_owner`
Whether Trader is none of the above three | `is_other`

In [None]:
def extract_xml(record):
    time.sleep(0.085)
    content = requests.get(record['XML Link']).content
    bs_content = bs(content, 'lxml')

    accession_no = str(re.split('/', record['XML Link'])[-2])
    try:
        trad_symbol = bs_content.find("issuertradingsymbol").text
    except:
        print(record['XML Link'])
        trad_symbol = ""      #bs_content.find("issuername").text
    try:
        form = bs_content.find("documenttype").text
    except:
        print(record['XML Link'], '<documenttype> not found')
        form = 4
    try:
        reporter_name = bs_content.find("rptownername").text.title()
    except:
        print(record['XML Link'], '<rptownername> not found')
        reporter_name = np.NaN
    try:
        reporter_cik = bs_content.find("rptownercik").text
    except:
        print(record['XML Link'], '<rptownercik> not found')
        reporter_cik = np.NaN

    try:
        is_director = int(bs_content.find("isdirector").text)
    except:
        is_director = 0
    
    try:
        is_officer = int(bs_content.find("isofficer").text)
    except:
        is_officer = 0
    
    try:
        is_beneficial_owner = int(bs_content.find("istenpercentowner").text)
    except:
        is_beneficial_owner = 0
    
    try:
        is_other = int(bs_content.find("isother").text)
    except:
        is_other = 0
        
    try:
        reporter_title = bs_content.find("officertitle").text
    except:
        reporter_title = np.NaN
        
    return [trad_symbol, accession_no, form, reporter_name, reporter_cik, reporter_title, is_director, is_officer, is_beneficial_owner, is_other]

In [None]:
%%time
data[['trad_symbol', 'accession_no', 'form', 'reporter_name', 'reporter_cik', 'reporter_title', 'is_director', 'is_officer', 'is_beneficial_owner', 'is_other']] = data.apply(extract_xml, axis = 1, result_type = 'expand')
data.to_csv(FORM4_DETAIL_XML, sep = '|', index = False)
data.head()

Unnamed: 0,Company,CIK,Date,XML Link,trad_symbol,accession_no,form,reporter_name,reporter_cik,reporter_title,is_director,is_officer,is_beneficial_owner,is_other
0,3M CO,66740,2014-05-01,https://www.sec.gov/Archives/edgar/data/66740/...,MMM,112760214015755,4,Palensky Fred J,1197743,EXEC VP R&D & CHF TECH OFF,0,1,0,0
1,3M CO,66740,2014-05-07,https://www.sec.gov/Archives/edgar/data/66740/...,MMM,112760214016681,4,Thulin Inge G,1263739,"Chairman, President & CEO",1,1,0,0
2,3M CO,66740,2014-05-01,https://www.sec.gov/Archives/edgar/data/66740/...,MMM,112760214015749,4,Palensky Fred J,1197743,EXEC VP R&D & CHF TECH OFF,0,1,0,0
3,3M CO,66740,2014-05-14,https://www.sec.gov/Archives/edgar/data/66740/...,MMM,112760214017542,4,Coffman Vance D,1193998,,1,0,0,0
4,3M CO,66740,2014-05-02,https://www.sec.gov/Archives/edgar/data/66740/...,MMM,112760214016109,4,Gangestad Nicholas C,1515709,VP CORP CNTRLR & CF ACCTG OFC,0,1,0,0


### Scrap Transactions from Each Form 4
The above table only keeps information about the reporter, date and company. However, the gist -- transaction details, is not yet scraped.  

As each file can have multiple non-derivative transactions, a new table is created. To join both tables for exploratory analysis, the accession number is also kept in this table as the primary key.  

This transaction table `tx_data` stores the following details:  

| Transaction Detail | Variable in DataFrame |
:--- | :--- 
Accession Number | `accession_no`
CIK | `CIK`
Stock Type (e.g. common stock) | `stock_type`
Transaction Date | `date`
Transaction Code | `code`
Acquired (A) or Disposed (D) | `acquired`
Number of Shares Transacted | `shares_exchanged`
Number of Shares Owned After Transaction | `net_shares_owned`
Stock Price During Transaction | `stock_price`
Whether Reporter has Direct Ownership | `direct_ownership`

In [None]:
data = pd.read_csv(FORM4_DETAIL_XML, sep='|')

In [None]:
content = requests.get('https://www.sec.gov/Archives/edgar/data/318154/000112760214017221/form4.xml').content
bs_content = bs(content, 'lxml')
# print(bs_content.prettify())
transactions = bs_content.find_all('nonderivativetransaction')
collected_data = []
for transaction in transactions:
  CIK = row.CIK
  accession_no = row.accession_no
try:
  stock_type = transaction.find('securitytitle').find('value').text 
except:
  stock_type = np.nan
  print("<securitytitle> not found")

try:
  date = datetime.datetime.strptime(transaction.find('transactiondate').find('value').text, "%Y-%m-%d").date()    # transaction date
except:
  date = np.nan
  print("<transactiondate> not found")

try:
  code = transaction.find('transactioncode').text       # # transaction code could be M, F
except:
  code = np.nan
  print("<transactioncode> not found")

try:
  acquired = (transaction.find('transactionacquireddisposedcode').find('value').text == 'A')
except:
  acquired = np.nan
  print("<transactionacquireddisposedcode> not found")

try:
  shares_exchanged = float(transaction.find('transactionshares').find('value').text)
  if acquired == False:
      shares_exchanged = -float(transaction.find('transactionshares').find('value').text)
except:
  shares_exchanged = np.nan
  print("<transactionshares> not found")


try:
  net_shares_owned = float(transaction.find('sharesownedfollowingtransaction').find('value').text)
except:
  print("<sharesownedfollowingtransaction> not found")

try:
  stock_price = float(transaction.find('transactionpricepershare').find('value').text)    # no stock price info if shares are 
except ValueError:
  stock_price = np.nan
  print("<transactionpricepershare> not found")

try:     
  direct_ownership = (transaction.find('directorindirectownership').find('value').text == 'D')     # ownership: Direct (D) or Indirect (I)
except:
  direct_ownership = np.nan
  print("<directorindirectownership> not found")

collected_data.append([accession_no, CIK, stock_type, date, code, acquired, shares_exchanged, net_shares_owned, stock_price, direct_ownership])

In [None]:
%%time
# CPU times: user 1h 10min 30s, sys: 1min 26s, total: 1h 11min 57s
# Wall time: 3h 11min 56s
tx_data = pd.DataFrame(columns=['accession_no', 'CIK', 'stock_type', 'date', 'code', 'acquired', 'shares_exchanged', 'net_shares_owned', 'stock_price', 'direct_ownership'])
not_found_data = pd.DataFrame(columns=['XML Link', 'Tag'])
not_found_tag = []

for index, row in data.loc[:, ['accession_no', 'CIK', 'XML Link']].iterrows():
    time.sleep(0.02)
    content = requests.get(row['XML Link']).content
    bs_content = bs(content, 'lxml')

    transactions = bs_content.find_all('nonderivativetransaction')
    collected_data = []
    for transaction in transactions:
        CIK = row.CIK
        accession_no = row.accession_no
        try:
            stock_type = transaction.find('securitytitle').find('value').text 
        except:
            stock_type = np.nan
            not_found_tag.append([row['XML Link'], "securitytitle"])

        try:
            date = datetime.datetime.strptime(transaction.find('transactiondate').find('value').text, "%Y-%m-%d").date()    # transaction date
        except:
            date = np.nan
            not_found_tag.append([row['XML Link'], "transactiondate"])
        
        try:
            code = transaction.find('transactioncode').text       # # transaction code could be M, F
        except:
            code = np.nan
            not_found_tag.append([row['XML Link'], "transactioncode"])

        try:
            acquired = (transaction.find('transactionacquireddisposedcode').find('value').text == 'A')
        except:
            acquired = np.nan
            not_found_tag.append([row['XML Link'], "transactionacquireddisposedcode"])
        
        try:
            shares_exchanged = float(transaction.find('transactionshares').find('value').text)
            if acquired == False:
                shares_exchanged = -float(transaction.find('transactionshares').find('value').text)
        except:
            shares_exchanged = np.nan
            not_found_tag.append([row['XML Link'], "transactionshares"])
        
        try:
            net_shares_owned = float(transaction.find('sharesownedfollowingtransaction').find('value').text)
        except:
            net_shares_owned = np.nan
            not_found_tag.append([row['XML Link'], "sharesownedfollowingtransaction"])

        try:
            stock_price = float(transaction.find('transactionpricepershare').find('value').text)    # no stock price info if shares are 
        except:
            stock_price = np.nan
            not_found_tag.append([row['XML Link'], "transactionpricepershare"])

        try:     
            direct_ownership = (transaction.find('directorindirectownership').find('value').text == 'D')     # ownership: Direct (D) or Indirect (I)
        except:
            direct_ownership = np.nan
            not_found_tag.append([row['XML Link'], "directorindirectownership"])

        collected_data.append([accession_no, CIK, stock_type, date, code, acquired, shares_exchanged, net_shares_owned, stock_price, direct_ownership])
        
    collected_data = pd.DataFrame(collected_data, columns = tx_data.columns)
    tx_data = pd.concat([tx_data, collected_data], ignore_index = True)

not_found_data = pd.DataFrame(not_found_tag, columns = not_found_data.columns)

CPU times: user 1h 10min 30s, sys: 1min 26s, total: 1h 11min 57s
Wall time: 3h 11min 56s


In [None]:
tx_data.head()

Unnamed: 0,accession_no,CIK,stock_type,date,code,acquired,shares_exchanged,net_shares_owned,stock_price,direct_ownership
0,112760214015755,66740,Common Stock,2014-04-30,S,False,-5288.0,47963.0,139.1,True
1,112760214015755,66740,Common Stock,2014-04-30,S,False,-1200.0,46763.0,139.1007,True
2,112760214015755,66740,Common Stock,2014-04-30,S,False,-8642.0,38121.0,139.11,True
3,112760214015755,66740,Common Stock,2014-04-30,S,False,-300.0,37821.0,139.1107,True
4,112760214015755,66740,Common Stock,2014-04-30,S,False,-500.0,37321.0,139.112,True


In [None]:
tx_data.to_csv(FORM4_TX_XML, sep='|', index = False)
files.download(FORM4_TX_XML)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Repeat for HTM Files

In [None]:
%%bash
echo "Company|CIK|Date|XML Link" >"/content/drive/My Drive/URECA/Database/form4_data_htm.csv"
cat '/content/drive/My Drive/URECA/Database/form4_data.csv' | grep "htm$" >>"/content/drive/My Drive/URECA/Database/form4_data_htm.csv"       # extract files with htm file types

In [None]:
data = pd.read_csv(FORM4_LINK_HTM, sep='|')

In [None]:
data.iloc[20]['XML Link']

'https://www.sec.gov/Archives/edgar/data/12927/000089102003001303/bel499.htm'

In [None]:
# record = data.iloc[20]
def extract_htm(record):
  content = requests.get(record['XML Link']).content
  bs_content = bs(content, 'lxml')
  print(record['XML Link'])
  accession_no = str(re.split('/', record['XML Link'])[-2])
  reporter_cik = np.nan
  tables = bs_content.find_all("table")
  form = ("FORM .*", tables[0].find("font", {"size": "4"}).text == "FORM 4")
  if form:
    form = 4
  else:
    form = np.nan

  try:
    reporter_name = tables[1].find_all("div")[1].text
  except:
    reporter_name = np.nan

  reporter_details = tables[1].find_all("td")[3].find_all("div")
  try:
    reporter_title = reporter_details[4].find("u").text
  except:
    reporter_title = np.nan
  try:
    is_director = (reporter_details[2].find_all("u")[0].text == "X")
  except:
    is_director = np.nan
  try:
    is_beneficial_owner = (reporter_details[2].find_all("u")[1].text == "X")
  except:
    is_beneficial_owner = np.nan
  try:
    is_officer = (reporter_details[3].find_all("u")[0].text == "X")
  except: is_officer = np.nan
  try:
    is_other = (reporter_details[3].find_all("u")[1].text == "X")
  except:
    is_other = np.nan
  # print(is_director)
  # print(is_beneficial_owner)
  # print(is_officer)
  # print(is_other)
  table_meta = bs_content.find_all("table")[1]
  issuer_and_trad_symbol = table_meta.find_all('td')[1].find_all('div')[1].text   # not yet extract
  # print(issuer_and_trad_symbol)
  # trad_symbol = re.search(r'.*\s([^\s]*)$', issuer_and_trad_symbol).group(0)
  try:
    trad_symbol = re.search(r'.*[(|\s]([A-Z]+)\)?$', issuer_and_trad_symbol).group(1)
  except:
    trad_symbol = np.nan
  # trad_symbol = re.search(r'.*[(\s]([A-Z]+)[)]$', issuer_and_trad_symbol).group(1)
  # print(trad_symbol)
  return [trad_symbol, accession_no, form, reporter_name, reporter_cik, reporter_title, is_director, is_officer, is_beneficial_owner, is_other]

In [None]:
%%time
data[['trad_symbol', 'accession_no', 'form', 'reporter_name', 'reporter_cik', 'reporter_title', 'is_director', 'is_officer', 'is_beneficial_owner', 'is_other']] = data.apply(extract_htm, axis = 1, result_type = 'expand')
data.to_csv(FORM4_DETAIL_HTM, sep = '|', index = False)
data.head()

https://www.sec.gov/Archives/edgar/data/318154/000118143103006258/rrd7824.htm
https://www.sec.gov/Archives/edgar/data/318154/000118143103006259/rrd7832.htm
https://www.sec.gov/Archives/edgar/data/318154/000118143103006350/rrd7904.htm
https://www.sec.gov/Archives/edgar/data/318154/000118143103005239/rrd7039.htm
https://www.sec.gov/Archives/edgar/data/318154/000118143103006360/rrd7857.htm
https://www.sec.gov/Archives/edgar/data/318154/000118143103006766/rrd8130.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001075/mcn492.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001074/gra491.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001077/sha494.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001076/pla493.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001078/sto487.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001079/big488.htm
https://www.sec.gov/Archives/edgar/data/12927/000089102003001080/bry489.htm


IndexError: ignored

In [None]:
content = requests.get(data.iloc[0]['XML Link']).content
bs_content = bs(content, 'lxml')
print(bs_content.prettify())

<html>
 <body>
  <document>
   <type>
    4
    <sequence>
     1
     <filename>
      rrd7824.htm
      <description>
       FORM 4
       <text>
        <title>
        </title>
        SEC Form 4
        <table border="1" cellpadding="4" cellspacing="0" width="100%">
         <tr>
          <td valign="top" width="254">
           <div align="center">
            <font face="arial,helvetica" size="4">
             <b>
              FORM 4
             </b>
            </font>
           </div>
           <br/>
           <div>
            <font face="arial,helvetica" size="1">
             <b>
              [  ] Check this box if no longer
              <br/>
              subject to Section 16.  Form 4 or Form
              <br/>
              5 obligations may continue.
              <br/>
              See Instruction 1(b).
             </b>
            </font>
           </div>
          </td>
          <td align="center" valign="middle" width="764">
           <font face="aria

### Repeat for TXT Files

In [None]:
%%bash
echo "Company|CIK|Date|XML Link" >Database/form4_data_txt.csv
cat 'Database/form4_data.csv' | grep "txt$" >>Database/form4_data_txt.csv       # extract files with htm file types

In [None]:
data = pd.read_csv(FORM4_LINK_TXT, sep='|')

In [None]:
pd.set_option('display.max_columns', 500)
data

Unnamed: 0,Company,CIK,Date,XML Link
0,NIKE INC,320187,2001-11-05,https://www.sec.gov/Archives/edgar/data/320187...
1,GOLDMAN SACHS GROUP INC/,886982,2001-12-10,https://www.sec.gov/Archives/edgar/data/886982...
2,AMERICAN EXPRESS CO,4962,2003-04-01,https://www.sec.gov/Archives/edgar/data/4962/0...
3,AMERICAN EXPRESS CO,4962,2003-04-01,https://www.sec.gov/Archives/edgar/data/4962/0...
4,AMERICAN EXPRESS CO,4962,2003-04-01,https://www.sec.gov/Archives/edgar/data/4962/0...
...,...,...,...,...
393,GOLDMAN SACHS GROUP INC/,886982,2002-03-11,https://www.sec.gov/Archives/edgar/data/886982...
394,GOLDMAN SACHS GROUP INC/,886982,2002-02-11,https://www.sec.gov/Archives/edgar/data/886982...
395,GOLDMAN SACHS GROUP INC/,886982,2002-01-28,https://www.sec.gov/Archives/edgar/data/886982...
396,GOLDMAN SACHS GROUP INC/,886982,2002-02-12,https://www.sec.gov/Archives/edgar/data/886982...


# Exploratory Analysis

# Appendix
This section discusses about two things:
1. More complicated commands used with examples: standalone code will be run to illustrate its use
2. Specific code run to ensure quicker research process. This code is not directly related to the research focus. Rather, it serves to improve the efficiency of the whole process

## Quickening Research Process

Even though GNU Parallel has existed for more than a decade, some machines do not have it installed. Thus, the command below is run to install it. Note that GNU Parallel utility should be referenced as `/usr/local/bin/parallel` instead of parallel (which is a `gnu` version) unless an alias is set in `~/.bashrc`

In [None]:
%%bash
whereis parallel

parallel: /usr/bin/parallel /usr/local/bin/parallel /usr/share/man/man1/parallel.1.gz


In [None]:
%%bash
wget http://ftp.gnu.org/gnu/parallel/parallel-latest.tar.bz2
sudo tar xjf parallel-latest.tar.bz2
cd parallel-20210322
sudo ./configure && make
sudo make install
cd
# wget http://ftp.gnu.org/gnu/parallel/parallel-latest.tar.bz2; sudo tar xjf parallel-latest.tar.bz2; cd parallel-20210222; sudo ./configure && make; sudo make install; cd

checking for a BSD-compatible install... /usr/bin/install -c
checking whether build environment is sane... yes
checking for a thread-safe mkdir -p... /bin/mkdir -p
checking for gawk... no
checking for mawk... mawk
checking whether make sets $(MAKE)... yes
checking whether make supports nested variables... yes
checking whether ln -s works... yes
checking that generated files are newer than configure... done
configure: creating ./config.status
config.status: creating Makefile
config.status: creating src/Makefile
config.status: creating config.h
make  all-recursive
make[1]: Entering directory '/content/parallel-20210322'
Making all in src
make[2]: Entering directory '/content/parallel-20210322/src'
make[2]: Nothing to be done for 'all'.
make[2]: Leaving directory '/content/parallel-20210322/src'
make[2]: Entering directory '/content/parallel-20210322'
make[2]: Leaving directory '/content/parallel-20210322'
make[1]: Leaving directory '/content/parallel-20210322'
Making install in src
make[

--2021-03-24 09:21:24--  http://ftp.gnu.org/gnu/parallel/parallel-latest.tar.bz2
Resolving ftp.gnu.org (ftp.gnu.org)... 209.51.188.20, 2001:470:142:3::b
Connecting to ftp.gnu.org (ftp.gnu.org)|209.51.188.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2233628 (2.1M) [application/x-bzip2]
Saving to: ‘parallel-latest.tar.bz2’

     0K .......... .......... .......... .......... ..........  2%  296K 7s
    50K .......... .......... .......... .......... ..........  4%  603K 5s
   100K .......... .......... .......... .......... ..........  6% 33.1M 3s
   150K .......... .......... .......... .......... ..........  9% 53.5M 3s
   200K .......... .......... .......... .......... .......... 11%  603K 3s
   250K .......... .......... .......... .......... .......... 13% 63.2M 2s
   300K .......... .......... .......... .......... .......... 16% 62.8M 2s
   350K .......... .......... .......... .......... .......... 18%  110M 2s
   400K .......... .......... .......

Google Cloud Console is used to run `extract.sh`. However, the console quickly loses connection. To resolve this, click events are simulated by inserting the [JavaScript](https://stackoverflow.com/questions/49976573/why-google-cloud-shell-auto-disconnect-after-1-hours) code below to the browser's console

In [None]:
%%javascript
setInterval(function() {document.elementFromPoint(500, 500).click();}, 30000);

As running the script is power-intensive, this notebook is run on Google Colab. Colab may have session timeout when it is idle even for 20 minutes. However, the whole script runs more than 2 hours. To resolve this, an iterative click simulation is performed in [JavaScript](https://stackoverflow.com/questions/54057011/google-colab-session-timeout)

In [None]:
%%javascript
function ClickToConnect(){             // only run in Colab
    document.querySelector('#top-toolbar > colab-connect-button').shadowRoot.querySelector('#connect').click();
    console.log("Connecting")
}
setInterval(ClickToConnect, 60*1000)
// clearInterval(ClickToConnect)  // to stop the click event

<IPython.core.display.Javascript object>

`tee` can print the output to files and stdin simulatneously. This is useful for executing the actual code while showing some content in the file for easier understanding. However, one can argue that printing to a file and then using `head` could achieve the same results.  

Is there any difference in terms of time? Will using `tee` slow the output process? The code below outputs the statistics into a file, with the size of the file content as the variable

In [None]:
%%bash
echo "Approach 1: without saving to a file, use tee to send output to head" >tee_head_time.csv
echo "Number of Rows: Duration (milliseconds)" >>tee_head_time.csv
for ((row_size = 10; row_size < 1000000000; row_size*=10)) do
    start=$(($(date +%s%N)/1000000));
    echo "digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,digit_7,digit_8,mod4,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14"
    for i in $(seq 1 1 $row_size); do 
        tmp=$(($i%10));
        echo $tmp,$(($(($i*2))%10)),$tmp,$tmp,$tmp,$tmp,$(($tmp % 3)),$(($tmp % 5)),$tmp$tmp,$tmp,$tmp,$tmp,$(($tmp % 4)),$tmp$tmp,$tmp,$tmp,$tmp,$(($tmp % 7)),$tmp$tmp$tmp,$tmp$tmp,$tmp,$tmp,$(($tmp % 4)); 
    done | tee | head 
    end=$(($(date +%s%N)/1000000))
    echo $row_size: $(($end-$start)) >>tee_head_time.csv
done;

echo "Approach 2: save output in a file, then use head utility" >>tee_head_time.csv
echo "Number of Rows: Duration (milliseconds)" >>tee_head_time.csv
for ((row_size = 10; row_size < 1000000000; row_size*=10)) do
    start=$(($(date +%s%N)/1000000));
    echo "digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,digit_7,digit_8,mod4,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14" >test.csv
    for i in $(seq 1 1 $row_size); do 
        tmp=$(($i%10));
        echo $tmp,$(($(($i*2))%10)),$tmp,$tmp,$tmp,$tmp,$(($tmp % 3)),$(($tmp % 5)),$tmp$tmp,$tmp,$tmp,$tmp,$(($tmp % 4)),$tmp$tmp,$tmp,$tmp,$tmp,$(($tmp % 7)),$tmp$tmp$tmp,$tmp$tmp,$tmp,$tmp,$(($tmp % 4)); 
    done >>test.csv
    head test.csv
    end=$(($(date +%s%N)/1000000))
    echo $row_size: $(($end-$start)) >>tee_head_time.csv
    rm test.csv;
done;

# getting milliseconds [https://stackoverflow.com/questions/16548528/command-to-get-time-in-milliseconds]

digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,digit_7,digit_8,mod4,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14
1,2,1,1,1,1,1,1,11,1,1,1,1,11,1,1,1,1,111,11,1,1,1
2,4,2,2,2,2,2,2,22,2,2,2,2,22,2,2,2,2,222,22,2,2,2
3,6,3,3,3,3,0,3,33,3,3,3,3,33,3,3,3,3,333,33,3,3,3
4,8,4,4,4,4,1,4,44,4,4,4,0,44,4,4,4,4,444,44,4,4,0
5,0,5,5,5,5,2,0,55,5,5,5,1,55,5,5,5,5,555,55,5,5,1
6,2,6,6,6,6,0,1,66,6,6,6,2,66,6,6,6,6,666,66,6,6,2
7,4,7,7,7,7,1,2,77,7,7,7,3,77,7,7,7,0,777,77,7,7,3
8,6,8,8,8,8,2,3,88,8,8,8,0,88,8,8,8,1,888,88,8,8,0
9,8,9,9,9,9,0,4,99,9,9,9,1,99,9,9,9,2,999,99,9,9,1
0,0,0,0,0,0,0,0,00,0,0,0,0,00,0,0,0,0,000,00,0,0,0
digit_1,mul2,digit_2,digit_3,digit_4,digit_5,mod3,mod5,doubledigit_1,digit_6,digit_7,digit_8,mod4,doubledigit_2,digit_9,digit_10,digit_11,mod7,triple,doubledigit_3,digit12,digit13,digit14
1,2,1,1,1,1,1,1,11,1,1,1,1,11,1,1,1,1,111,11,1,1,1
2,4,2,2,2,2,2,2,22,2,2,2,2,22,2,2,2,2,222,22,2,2,2
3,6,3,3,3,3,