In [8]:
import os
import edgar
import subprocess
import pandas as pd

In [2]:
# unload files into 'unload' folder
folder = 'unload'
startYear = '2017'
cmd = f'python run.py -y {startYear} -d {folder}'

process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
print('\n'.join(output.decode("utf-8").split(' - ')))

2018-08-15 00:22:15,681
DEBUG
downloads will be saved to unload
2018-08-15 00:22:15,681
DEBUG
downloading files since 2017
2018-08-15 00:22:15,681
INFO
7 index files to retrieve
2018-08-15 00:22:15,681
DEBUG
worker count: 8
2018-08-15 00:22:17,228
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2017/QTR4/master.zip to unload/2017-QTR4.tsv
2018-08-15 00:22:17,514
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2018/QTR3/master.zip to unload/2018-QTR3.tsv
2018-08-15 00:22:17,638
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2017/QTR2/master.zip to unload/2017-QTR2.tsv
2018-08-15 00:22:17,821
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2018/QTR2/master.zip to unload/2018-QTR2.tsv
2018-08-15 00:22:18,001
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2018/QTR1/master.zip to unload/2018-QTR1.tsv
2018-08-15 00:22:18,038
INFO
> downloaded https://www.sec.gov/Archives/edgar/full-index/2017/QTR1/mast

In [3]:
# merge unloaded files into master index in 'mstrIdx' folder
mstr_folder = 'mstrIdx'
os.makedirs(mstr_folder, exist_ok=True)
# cmd = f'cat {folder}/*.tsv > {mstr_folder}/master.tsv'

file_lst = [x for x in os.listdir(folder) if x[-4:]=='.tsv']
with open(f'{mstr_folder}/master.tsv', 'w') as f:
    for fname in file_lst:
        with open(f'{folder}/{fname}') as infile:
            for line in infile:
                f.write(line)
        print(fname)

2017-QTR1.tsv
2017-QTR2.tsv
2017-QTR3.tsv
2017-QTR4.tsv
2018-QTR1.tsv
2018-QTR2.tsv
2018-QTR3.tsv


In [4]:
# Select security and find its index
code = '1000045'
idx_dir = 'idx'
os.makedirs(idx_dir, exist_ok=True)
cmd = f'grep -h {code} {folder}/*.tsv'
print(cmd)

process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()

with open(f'{idx_dir}/{code}.txt', 'wb') as f:
    f.write(output)

grep -h 1000045 unload/*.tsv


In [9]:
idxDF = pd.read_csv(f'{idx_dir}/{code}.txt', sep='|', header=None)
idxDF.columns = ['Code', 'Name', 'fileType', 'reportDate', 'raw_path', 'url']
print(idxDF.head())

      Code                    Name fileType  reportDate  \
0  1000045  NICHOLAS FINANCIAL INC     10-Q  2017-02-09   
1  1000045  NICHOLAS FINANCIAL INC        3  2017-01-05   
2  1000045  NICHOLAS FINANCIAL INC        4  2017-01-05   
3  1000045  NICHOLAS FINANCIAL INC        4  2017-01-09   
4  1000045  NICHOLAS FINANCIAL INC        4  2017-01-12   

                                      raw_path  \
0  edgar/data/1000045/0001193125-17-035551.txt   
1  edgar/data/1000045/0001398344-17-000151.txt   
2  edgar/data/1000045/0001398344-17-000154.txt   
3  edgar/data/1000045/0001398344-17-000309.txt   
4  edgar/data/1000045/0001398344-17-000516.txt   

                                                 url  
0  edgar/data/1000045/0001193125-17-035551-index....  
1  edgar/data/1000045/0001398344-17-000151-index....  
2  edgar/data/1000045/0001398344-17-000154-index....  
3  edgar/data/1000045/0001398344-17-000309-index....  
4  edgar/data/1000045/0001398344-17-000516-index....  


In [10]:
idxDF['fileType'].unique()

array(['10-Q', '3', '4', '8-K', 'CORRESP', 'SC 13D/A', 'SC 13D', 'SC 13G',
       'UPLOAD', '10-K', 'SC 13G/A', 'DEF 14A'], dtype=object)

In [18]:
filing_10q_lst = idxDF.loc[idxDF['fileType']=='10-Q'].reset_index()
print(filing_10q_lst)

   index     Code                    Name fileType  reportDate  \
0      0  1000045  NICHOLAS FINANCIAL INC     10-Q  2017-02-09   
1     41  1000045  NICHOLAS FINANCIAL INC     10-Q  2017-11-09   
2     53  1000045  NICHOLAS FINANCIAL INC     10-Q  2018-02-09   

                                      raw_path  \
0  edgar/data/1000045/0001193125-17-035551.txt   
1  edgar/data/1000045/0001193125-17-338757.txt   
2  edgar/data/1000045/0001193125-18-037381.txt   

                                                 url  
0  edgar/data/1000045/0001193125-17-035551-index....  
1  edgar/data/1000045/0001193125-17-338757-index....  
2  edgar/data/1000045/0001193125-18-037381-index....  


In [19]:
tmp_lst = filing_10q_lst.T.to_dict()
for i in tmp_lst:
    init_ = 'https://www.sec.gov/Archives'
    url_path = '/'.join([init_, tmp_lst[i]['raw_path']])
    date_ = tmp_lst[i]['reportDate'].replace('-', '')
    data_root = str(tmp_lst[i]['fileType'])
    data_folder = '/'.join([data_root, code])
    os.makedirs(data_folder, exist_ok=True)
    cmd = f'curl {url_path} --output {data_folder}/{code}_{date_}.html'
    print(cmd)

    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()

curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-17-035551.txt --output 10-Q/1000045/1000045_20170209.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-17-338757.txt --output 10-Q/1000045/1000045_20171109.html
curl https://www.sec.gov/Archives/edgar/data/1000045/0001193125-18-037381.txt --output 10-Q/1000045/1000045_20180209.html
