In [1]:
import os
import edgar
import subprocess
import pandas as pd

In [2]:
# unload files into 'unload' folder
folder = 'unload'
startYear = '2017'
cmd = f'python run.py -y {startYear} -d {folder}'

process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
print('\n'.join(output.decode("utf-8").split(' - ')))

2018-08-15 15:39:53,153
DEBUG
downloads will be saved to unload
2018-08-15 15:39:53,153
DEBUG
downloading files since 2017
2018-08-15 15:39:53,153
INFO
7 index files to retrieve
2018-08-15 15:39:53,153
DEBUG
worker count: 4
2018-08-15 15:39:54,842
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2017/QTR4/master.zip to unload/2017-QTR4.tsv
2018-08-15 15:39:54,870
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2018/QTR3/master.zip to unload/2018-QTR3.tsv
2018-08-15 15:39:55,306
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2018/QTR2/master.zip to unload/2018-QTR2.tsv
2018-08-15 15:39:56,161
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2017/QTR2/master.zip to unload/2017-QTR2.tsv
2018-08-15 15:39:56,671
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2017/QTR3/master.zip to unload/2017-QTR3.tsv
2018-08-15 15:39:56,842
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2018/QTR1/master.zip to

In [3]:
# merge unloaded files into master index in 'mstrIdx' folder
mstr_folder = 'mstrIdx'
os.makedirs(mstr_folder, exist_ok=True)
# cmd = f'cat {folder}/*.tsv > {mstr_folder}/master.tsv'

file_lst = [x for x in os.listdir(folder) if x[-4:]=='.tsv']
with open(f'{mstr_folder}/master.tsv', 'w') as f:
    for fname in file_lst:
        with open(f'{folder}/{fname}') as infile:
            for line in infile:
                f.write(line)
        print(fname)

2017-QTR2.tsv
2017-QTR4.tsv
2017-QTR3.tsv
2018-QTR2.tsv
2018-QTR3.tsv
2018-QTR1.tsv
2017-QTR1.tsv


In [4]:
# Select security and find its index
code = '1000045'
idx_dir = 'idx'
os.makedirs(idx_dir, exist_ok=True)

idx_list = []
for file in [x for x in os.listdir(folder) if x[-4:]=='.tsv']:
    with open(f'{folder}/{file}') as f:
        for line in f:
            if code in line:
                idx_list.append(line)
idx_list = [x.split('|') for x in idx_list]

In [5]:
idxDF = pd.DataFrame(idx_list, columns=['Code', 'Name', 'fileType', 'reportDate', 'raw_path', 'url'])
print(idxDF.head())

      Code                    Name fileType  reportDate  \
0  1000045  NICHOLAS FINANCIAL INC     10-K  2017-06-14   
1  1000045  NICHOLAS FINANCIAL INC        4  2017-05-24   
2  1000045  NICHOLAS FINANCIAL INC        4  2017-05-30   
3  1000045  NICHOLAS FINANCIAL INC        4  2017-05-30   
4  1000045  NICHOLAS FINANCIAL INC        4  2017-05-30   

                                      raw_path  \
0  edgar/data/1000045/0001193125-17-203193.txt   
1  edgar/data/1000045/0001398344-17-006781.txt   
2  edgar/data/1000045/0001000045-17-000001.txt   
3  edgar/data/1000045/0001000045-17-000002.txt   
4  edgar/data/1000045/0001000045-17-000003.txt   

                                                 url  
0  edgar/data/1000045/0001193125-17-203193-index....  
1  edgar/data/1000045/0001398344-17-006781-index....  
2  edgar/data/1000045/0001000045-17-000001-index....  
3  edgar/data/1000045/0001000045-17-000002-index....  
4  edgar/data/1000045/0001000045-17-000003-index....  


In [6]:
idxDF['fileType'].unique()

array(['10-K', '4', '8-K', 'CORRESP', 'SC 13D/A', 'UPLOAD', '10-Q', '3',
       '4/A', 'DEF 14A', 'SC 13G/A', 'SC 13G', 'SC 13D'], dtype=object)

#### 10-Q

In [7]:
filing_10q_lst = idxDF.loc[idxDF['fileType']=='10-Q'].reset_index()
print(filing_10q_lst)

   index     Code                    Name fileType  reportDate  \
0     16  1000045  NICHOLAS FINANCIAL INC     10-Q  2017-11-09   
1     28  1000045  NICHOLAS FINANCIAL INC     10-Q  2017-08-09   
2     62  1000045  NICHOLAS FINANCIAL INC     10-Q  2018-08-14   
3     73  1000045  NICHOLAS FINANCIAL INC     10-Q  2018-02-09   
4     87  1000045  NICHOLAS FINANCIAL INC     10-Q  2017-02-09   

                                      raw_path  \
0  edgar/data/1000045/0001193125-17-338757.txt   
1  edgar/data/1000045/0001193125-17-252199.txt   
2  edgar/data/1000045/0001193125-18-248315.txt   
3  edgar/data/1000045/0001193125-18-037381.txt   
4  edgar/data/1000045/0001193125-17-035551.txt   

                                                 url  
0  edgar/data/1000045/0001193125-17-338757-index....  
1  edgar/data/1000045/0001193125-17-252199-index....  
2  edgar/data/1000045/0001193125-18-248315-index....  
3  edgar/data/1000045/0001193125-18-037381-index....  
4  edgar/data/1000045/00011

In [8]:
tmp_lst = filing_10q_lst.T.to_dict()
for i in tmp_lst:
    init_ = 'https://www.sec.gov/Archives'
    url_path = '/'.join([init_, tmp_lst[i]['raw_path']])
    date_ = tmp_lst[i]['reportDate'].replace('-', '')
    data_root = 'data/' + str(tmp_lst[i]['fileType'])
    data_folder = '/'.join([data_root, code])
    os.makedirs(data_folder, exist_ok=True)
    cmd = f'curl {url_path} --output {data_folder}/{code}_{date_}.html'
    print(cmd)

    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()

curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-17-338757.txt --output data/10-Q/1000045/1000045_20171109.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-17-252199.txt --output data/10-Q/1000045/1000045_20170809.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-18-248315.txt --output data/10-Q/1000045/1000045_20180814.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-18-037381.txt --output data/10-Q/1000045/1000045_20180209.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-17-035551.txt --output data/10-Q/1000045/1000045_20170209.html


#### 10-K

In [9]:
filing_10k_lst = idxDF.loc[idxDF['fileType']=='10-K'].reset_index()
print(filing_10k_lst)

   index     Code                    Name fileType  reportDate  \
0      0  1000045  NICHOLAS FINANCIAL INC     10-K  2017-06-14   
1     54  1000045  NICHOLAS FINANCIAL INC     10-K  2018-06-27   

                                      raw_path  \
0  edgar/data/1000045/0001193125-17-203193.txt   
1  edgar/data/1000045/0001193125-18-205637.txt   

                                                 url  
0  edgar/data/1000045/0001193125-17-203193-index....  
1  edgar/data/1000045/0001193125-18-205637-index....  


In [10]:
tmp_lst = filing_10k_lst.T.to_dict()
for i in tmp_lst:
    init_ = 'https://www.sec.gov/Archives'
    url_path = '/'.join([init_, tmp_lst[i]['raw_path']])
    date_ = tmp_lst[i]['reportDate'].replace('-', '')
    data_root = 'data/' + str(tmp_lst[i]['fileType'])
    data_folder = '/'.join([data_root, code])
    os.makedirs(data_folder, exist_ok=True)
    cmd = f'curl {url_path} --output {data_folder}/{code}_{date_}.html'
    print(cmd)

    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()

curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-17-203193.txt --output data/10-K/1000045/1000045_20170614.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-18-205637.txt --output data/10-K/1000045/1000045_20180627.html
